{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T18:04:57Z","timestamp":1768327497617,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":46,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819557394","type":"print"},{"value":"9789819557400","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/2.zoppoz.workers.dev:443\/https\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/2.zoppoz.workers.dev:443\/https\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5740-0_25","type":"book-chapter","created":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T12:23:22Z","timestamp":1768220602000},"page":"360-373","update-policy":"https:\/\/2.zoppoz.workers.dev:443\/https\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["MMVQA: Dual-Path Multimodal Fusion for\u00a0AI-Generated Video Quality Assessment"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/2.zoppoz.workers.dev:443\/https\/orcid.org\/0009-0005-1738-3209","authenticated-orcid":false,"given":"Yuhang","family":"Wu","sequence":"first","affiliation":[]},{"given":"Wuyuan","family":"Xie","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/2.zoppoz.workers.dev:443\/https\/orcid.org\/0000-0003-1125-9299","authenticated-orcid":false,"given":"Miaohui","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,13]]},"reference":[{"key":"25_CR1","unstructured":"Antkowiak, J., et al.: Final report from the video quality experts group on the validation of objective models of video quality assessment march 2000. Final Rep. Video Qual. Exp. Group Valid. Obj. Models Video Qual. Ass. March 2000 (2000)"},{"key":"25_CR2","doi-asserted-by":"crossref","unstructured":"Baeva, D., Ivanova, G.: Llm logical reasoning related to aesthetic universals. In: Proceedings of the Bulgarian Academy of Sciences, vol.\u00a077, pp. 1792\u20131800 (2024)","DOI":"10.7546\/CRABS.2024.12.07"},{"key":"25_CR3","first-page":"8","volume":"1","author":"T Brooks","year":"2024","unstructured":"Brooks, T., et al.: Video generation models as world simulators. OpenAI Blog 1, 8 (2024)","journal-title":"OpenAI Blog"},{"key":"25_CR4","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"25_CR5","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: Slowfast networks for video recognition. In: IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 6202\u20136211 (2019)","DOI":"10.1109\/ICCV.2019.00630"},{"key":"25_CR6","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"25_CR7","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local Nash equilibrium. Adv. Neural Inf. Process, Syst. 30 (2017)"},{"issue":"11","key":"25_CR8","doi-asserted-by":"publisher","first-page":"7386","DOI":"10.1109\/TCSVT.2022.3186307","volume":"32","author":"J Hou","year":"2022","unstructured":"Hou, J., Ding, H., Lin, W., Liu, W., Fang, Y.: Distilling knowledge from object classification to aesthetics assessment. IEEE Trans. Circuits Syst. Video Technol. 32(11), 7386\u20137402 (2022)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"25_CR9","unstructured":"Hou, J., et al.: Towards transparent deep image aesthetics assessment with tag-based content descriptors. IEEE Trans. Image Process. (2023)"},{"key":"25_CR10","doi-asserted-by":"crossref","unstructured":"Huang, Y., Wang, M.: An efficient quality assessment method for screen content image based on gabor. In: IEEE International Conference on Signal and Image Processing (ICSIP), pp. 201\u2013205 (2020)","DOI":"10.1109\/ICSIP49896.2020.9339420"},{"key":"25_CR11","doi-asserted-by":"crossref","unstructured":"Huang, Y., et al.: Aesexpert: Towards multi-modality foundation model for image aesthetics perception. In: ACM International Conference on Multimedia (ACMMM), pp. 5911\u20135920 (2024)","DOI":"10.1145\/3664647.3680649"},{"key":"25_CR12","first-page":"36652","volume":"36","author":"Y Kirstain","year":"2023","unstructured":"Kirstain, Y., Polyak, A., Singer, U., Matiana, S., Penna, J., Levy, O.: Pick-a-pic: An open dataset of user preferences for text-to-image generation. Adv. Neural. Inf. Process. Syst. 36, 36652\u201336663 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"25_CR13","doi-asserted-by":"crossref","unstructured":"Korhonen, J., Su, Y., You, J.: Blind natural video quality prediction via statistical temporal features and deep spatial features. In: ACM International Conference on Multimedia (ACMMM), pp. 3311\u20133319 (2020)","DOI":"10.1145\/3394171.3413845"},{"key":"25_CR14","doi-asserted-by":"crossref","unstructured":"Kou, T., et al.: Subjective-aligned dataset and metric for text-to-video quality assessment. In: ACM International Conference on Multimedia (ACMMM), pp. 7793\u20137802 (2024)","DOI":"10.1145\/3664647.3680868"},{"issue":"9","key":"25_CR15","doi-asserted-by":"publisher","first-page":"5944","DOI":"10.1109\/TCSVT.2022.3164467","volume":"32","author":"B Li","year":"2022","unstructured":"Li, B., Zhang, W., Tian, M., Zhai, G., Wang, X.: Blindly assess quality of in-the-wild videos via quality-aware pre-training and motion perception. IEEE Trans. Circuits Syst. Video Technol. 32(9), 5944\u20135958 (2022)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"25_CR16","doi-asserted-by":"crossref","unstructured":"Li, D., Jiang, T., Jiang, M.: Quality assessment of in-the-wild videos. In: ACM International Conference on Multimedia (ACMM), pp. 2351\u20132359 (2019)","DOI":"10.1145\/3343031.3351028"},{"key":"25_CR17","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning (ICML), pp. 12888\u201312900. PMLR (2022)"},{"key":"25_CR18","doi-asserted-by":"crossref","unstructured":"Liu, Z., Mao, H., Wu, C.Y., Feichtenhofer, C., Darrell, T., Xie, S.: A convnet for the 2020s. In: Proceedings of the IEEE\/CVF Conference on Computer Vision And Pattern Recognition, pp. 11976\u201311986 (2022)","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"25_CR19","doi-asserted-by":"crossref","unstructured":"Lu, Y., et al.: Aigc-vqa: A holistic perception metric for aigc video quality assessment. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6384\u20136394 (2024)","DOI":"10.1109\/CVPRW63382.2024.00640"},{"key":"25_CR20","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning (ICML), pp. 8748\u20138763. PmLR (2021)"},{"key":"25_CR21","unstructured":"Salimans, T., Goodfellow, I., Zaremba, W., Cheung, V., Radford, A., Chen, X.: Improved techniques for training GANs. Adv. Neural Inf. Process. Syst. 29 (2016)"},{"key":"25_CR22","unstructured":"Singer, U., et\u00a0al.: Make-a-video: Text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792 (2022)"},{"key":"25_CR23","unstructured":"Sun, S., Liang, X., Qu, B., Gao, W.: Content-rich aigc video quality assessment via intricate text alignment and motion-aware consistency. arXiv preprint arXiv:2502.04076 (2025)"},{"key":"25_CR24","doi-asserted-by":"crossref","unstructured":"Sun, W., Min, X., Lu, W., Zhai, G.: A deep learning based no-reference quality assessment model for ugc videos. In: ACM International Conference on Multimedia (ACMMM), pp. 856\u2013865 (2022)","DOI":"10.1145\/3503161.3548329"},{"key":"25_CR25","unstructured":"Team, K.: Kolors: Effective training of diffusion model for photorealistic text-to-image synthesis. arXiv preprint (2024)"},{"issue":"4","key":"25_CR26","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3632178","volume":"20","author":"A Telili","year":"2023","unstructured":"Telili, A., Fezza, S.A., Hamidouche, W., Brachemi Meftah, H.F.: 2bivqa: Double bi-LSTM-based video quality assessment of ugc videos. ACM Trans. Multimed. Comput. Commun. Appl. 20(4), 1\u201322 (2023)","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"25_CR27","first-page":"1","volume":"70","author":"M Wang","year":"2022","unstructured":"Wang, M., et al.: Quality measurement of screen images via foreground perception and background suppression. IEEE Trans. Instrum. Meas. 70, 1\u201311 (2022)","journal-title":"IEEE Trans. Instrum. Meas."},{"issue":"4","key":"25_CR28","doi-asserted-by":"publisher","first-page":"6026","DOI":"10.1109\/TII.2022.3173934","volume":"19","author":"M Wang","year":"2023","unstructured":"Wang, M., Huang, Y., Xiong, J., Xie, W.: Low-light images in-the-wild: A novel visibility perception-guided blind quality indicator. IEEE Trans. Industr. Inf. 19(4), 6026\u20136036 (2023)","journal-title":"IEEE Trans. Industr. Inf."},{"key":"25_CR29","doi-asserted-by":"crossref","unstructured":"Wang, M., Huang, Y., Zhang, J.: Blind quality assessment of night-time images via weak illumination analysis. In: IEEE International Conference on Multimedia and Expo (ICME), pp.\u00a01\u20136 (2021)","DOI":"10.1109\/ICME51207.2021.9428097"},{"key":"25_CR30","doi-asserted-by":"crossref","unstructured":"Wang, M., Xu, Z., Gong, Y., Xie, W.: S-CCR: Super-complete comparative representation for low-light image quality inference in-the-wild. In: ACM International Conference on Multimedia (ACM MM), pp. 5219\u20135227 (2022)","DOI":"10.1145\/3503161.3548083"},{"issue":"4","key":"25_CR31","doi-asserted-by":"publisher","first-page":"1665","DOI":"10.1007\/s11263-024-02239-9","volume":"133","author":"M Wang","year":"2025","unstructured":"Wang, M., Xu, Z., Xu, M., Lin, W.: Blind multimodal quality assessment of low-light images. Springer Int. J. Comput. Vision 133(4), 1665\u20131688 (2025)","journal-title":"Springer Int. J. Comput. Vision"},{"key":"25_CR32","doi-asserted-by":"publisher","first-page":"1849","DOI":"10.1109\/TIP.2025.3550005","volume":"34","author":"M Wang","year":"2025","unstructured":"Wang, M., Xu, Z., Zhang, X., Fang, Y., Lin, W.: Visual quality assessment of composite images: a compression-oriented database and measurement. IEEE Trans. Image Process. 34, 1849\u20131863 (2025)","journal-title":"IEEE Trans. Image Process."},{"key":"25_CR33","unstructured":"Wang, Y., et\u00a0al.: Internvid: A large-scale video-text dataset for multimodal understanding and generation. arXiv preprint arXiv:2307.06942 (2023)"},{"key":"25_CR34","unstructured":"Wu, C., et al.: Godiva: Generating open-domain videos from natural descriptions. arXiv preprint arXiv:2104.14806 (2021)"},{"key":"25_CR35","doi-asserted-by":"crossref","unstructured":"Wu, H., et al.: Fast-vqa: Efficient end-to-end video quality assessment with fragment sampling. In: European Conference on Computer Vision (ECCV), pp. 538\u2013554. Springer (2022)","DOI":"10.1007\/978-3-031-20068-7_31"},{"issue":"12","key":"25_CR36","doi-asserted-by":"publisher","first-page":"15185","DOI":"10.1109\/TPAMI.2023.3319332","volume":"45","author":"H Wu","year":"2023","unstructured":"Wu, H., et al.: Neighbourhood representative sampling for efficient end-to-end video quality assessment. IEEE Trans. Pattern Anal. Mach. Intell. 45(12), 15185\u201315202 (2023)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"9","key":"25_CR37","doi-asserted-by":"publisher","first-page":"4840","DOI":"10.1109\/TCSVT.2023.3249741","volume":"33","author":"H Wu","year":"2023","unstructured":"Wu, H., et al.: Discovqa: Temporal distortion-content transformers for video quality assessment. IEEE Trans. Circuits Syst. Video Technol. 33(9), 4840\u20134854 (2023)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"25_CR38","doi-asserted-by":"crossref","unstructured":"Wu, H., et al.: Exploring video quality assessment on user generated contents from aesthetic and technical perspectives. In: IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 20144\u201320154 (2023)","DOI":"10.1109\/ICCV51070.2023.01843"},{"key":"25_CR39","unstructured":"Wu, H., et\u00a0al.: Q-align: Teaching LMMs for visual scoring via discrete text-defined levels. arXiv preprint arXiv:2312.17090 (2023)"},{"key":"25_CR40","unstructured":"Wu, X., et al.: Human preference score v2: A solid benchmark for evaluating human preferences of text-to-image synthesis. arXiv preprint arXiv:2306.09341 (2023)"},{"key":"25_CR41","doi-asserted-by":"crossref","unstructured":"Xie, W., Bian, T., Wang, M.: kgMBQA: Quality Knowledge Graph-driven Multimodal Blind Image Assessment. International Joint Conference on Artificial Intelligence (IJCAI), pp.\u00a01\u20139 (2025)","DOI":"10.24963\/ijcai.2025\/473"},{"key":"25_CR42","doi-asserted-by":"publisher","first-page":"2250","DOI":"10.1109\/LSP.2024.3452556","volume":"31","author":"W Xie","year":"2024","unstructured":"Xie, W., Liu, Y., Wang, K., Wang, M.: LLM-guided cross-modal point cloud quality assessment: A graph learning approach. IEEE Signal Process. Lett. 31, 2250\u20132254 (2024)","journal-title":"IEEE Signal Process. Lett."},{"key":"25_CR43","doi-asserted-by":"crossref","unstructured":"Xie, W., Wang, K., Ju, Y., Wang, M.: pmbqa: Projection-based blind point cloud quality assessment via multimodal learning. In: ACM International Conference on Multimedia (ACM MM), pp. 3250\u20133258 (2023)","DOI":"10.1145\/3581783.3611998"},{"key":"25_CR44","first-page":"15903","volume":"36","author":"J Xu","year":"2023","unstructured":"Xu, J., et al.: Imagereward: Learning and evaluating human preferences for text-to-image generation. Adv. Neural. Inf. Process. Syst. 36, 15903\u201315935 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"25_CR45","unstructured":"Zhang, Z., et\u00a0al.: Benchmarking aigc video quality assessment: A dataset and unified model. arXiv preprint arXiv:2407.21408 (2024)"},{"key":"25_CR46","unstructured":"Zheng, Q., et\u00a0al.: Video quality assessment: A comprehensive survey. arXiv preprint arXiv:2412.04508 (2024)"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/2.zoppoz.workers.dev:443\/https\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5740-0_25","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T12:23:56Z","timestamp":1768220636000},"score":1,"resource":{"primary":{"URL":"https:\/\/2.zoppoz.workers.dev:443\/https\/link.springer.com\/10.1007\/978-981-95-5740-0_25"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819557394","9789819557400"],"references-count":46,"URL":"https:\/\/2.zoppoz.workers.dev:443\/https\/doi.org\/10.1007\/978-981-95-5740-0_25","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"13 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shanghai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/2.zoppoz.workers.dev:443\/http\/2025.prcv.cn\/index.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}