{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T03:59:55Z","timestamp":1774929595576,"version":"3.50.1"},"reference-count":42,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,10,1]],"date-time":"2021-10-01T00:00:00Z","timestamp":1633046400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,10,1]],"date-time":"2021-10-01T00:00:00Z","timestamp":1633046400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,10,1]],"date-time":"2021-10-01T00:00:00Z","timestamp":1633046400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,10]]},"DOI":"10.1109\/iccvw54120.2021.00355","type":"proceedings-article","created":{"date-parts":[[2021,11,24]],"date-time":"2021-11-24T20:40:09Z","timestamp":1637786409000},"page":"3156-3165","source":"Crossref","is-referenced-by-count":411,"title":["Video Transformer Network"],"prefix":"10.1109","author":[{"given":"Daniel","family":"Neimark","sequence":"first","affiliation":[]},{"given":"Omri","family":"Bar","sequence":"additional","affiliation":[]},{"given":"Maya","family":"Zohar","sequence":"additional","affiliation":[]},{"given":"Dotan","family":"Asselmann","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00813"},{"key":"ref38","first-page":"20","article-title":"Temporal segment networks: Towards good practices for deep action recognition","author":"wang","year":"2016","journal-title":"European Conference on Computer Vision"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.220"},{"key":"ref32","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2015","journal-title":"International Conference on Learning Representations"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00058"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"ref37","first-page":"5998","article-title":"Attention is all you need","volume":"30","author":"vaswani","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref36","first-page":"4489","article-title":"Learning spatiotemporal features with 3d convolutional networks","author":"tran","year":"2015","journal-title":"Proceedings of the IEEE International Conference on Computer Vision"},{"key":"ref35","article-title":"Francisco Massa, Alexandre Sablayrolles, and Hervé Jégou. Training data-efficient image transformers & distillation through attention","author":"touvron","year":"2020"},{"key":"ref34","first-page":"6105","article-title":"Efficientnet: Rethinking model scaling for convolutional neural networks","author":"tan","year":"2019","journal-title":"International Conference on Machine Learning"},{"key":"ref10","article-title":"Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. An image is worth 16x16 words: Transformers for image recognition at scale","author":"dosovitskiy","year":"2021","journal-title":"International Conference on Learning Representations"},{"key":"ref40","article-title":"End-to-end video instance segmentation with transformers","author":"wang","year":"2020"},{"key":"ref11","author":"fan","year":"2020","journal-title":"Pyslowfast"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58529-7_30"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00033"},{"key":"ref16","first-page":"1440","article-title":"Fast r-cnn","author":"girshick","year":"2015","journal-title":"Proceedings of the IEEE International Conference on Computer Vision"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.81"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"ref4","first-page":"1130","article-title":"and Rahul Sukthankar. Rethinking the faster r-cnn architecture for temporal action localization","author":"chao","year":"2018","journal-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition"},{"key":"ref27","article-title":"Automatic differentiation in pytorch","author":"paszke","year":"2017"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00359"},{"key":"ref29","article-title":"Assemblenet: Searching for multi-stream neural connectivity in video architectures","author":"ryoo","year":"2020","journal-title":"International Conference on Learning Representations"},{"key":"ref5","first-page":"3468","article-title":"Spatiotemporal residual networks for video action recognition","author":"christoph","year":"2016","journal-title":"Advances in neural information processing systems"},{"key":"ref8","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2019","journal-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics Human Language Technologies Volume 1 (Long and Short Papers)"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref2","first-page":"213","article-title":"Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers","author":"carion","year":"2020","journal-title":"European Conference on Computer Vision"},{"key":"ref9","first-page":"2625","article-title":"Marcus Rohrbach, Subhashini Venugopalan, Kate Saenko, and Trevor Darrell. Long-term recurrent convolutional networks for visual recognition and description","author":"donahue","year":"2015","journal-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition"},{"key":"ref1","article-title":"Longformer: The long-document transformer","author":"beltagy","year":"2020"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.59"},{"key":"ref22","first-page":"1097","article-title":"Imagenet classification with deep convolutional neural networks","volume":"25","author":"krizhevsky","year":"2012","journal-title":"Advances in neural information processing systems"},{"key":"ref21","article-title":"The kinetics human action video dataset","author":"kay","year":"2017"},{"key":"ref42","first-page":"8739","article-title":"Richard Socher, and Caiming Xiong. End-to-end dense video captioning with masked transformer","author":"zhou","year":"2018","journal-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition"},{"key":"ref24","article-title":"Roberta: A robustly optimized bert pretraining approach","author":"liu","year":"2019"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00037"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00718"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2901464"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298965"}],"event":{"name":"2021 IEEE\/CVF International Conference on Computer Vision Workshops (ICCVW)","location":"Montreal, BC, Canada","start":{"date-parts":[[2021,10,11]]},"end":{"date-parts":[[2021,10,17]]}},"container-title":["2021 IEEE\/CVF International Conference on Computer Vision Workshops (ICCVW)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9607382\/9607383\/09607406.pdf?arnumber=9607406","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,10]],"date-time":"2022-05-10T16:51:41Z","timestamp":1652201501000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9607406\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10]]},"references-count":42,"URL":"https:\/\/doi.org\/10.1109\/iccvw54120.2021.00355","relation":{},"subject":[],"published":{"date-parts":[[2021,10]]}}}