{"id":"https://openalex.org/W4402716131","doi":"https://doi.org/10.1109/cvpr52733.2024.01261","title":"Synthesize, Diagnose, and Optimize: Towards Fine-Grained Vision-Language Understanding","display_name":"Synthesize, Diagnose, and Optimize: Towards Fine-Grained Vision-Language Understanding","publication_year":2024,"publication_date":"2024-06-16","ids":{"openalex":"https://openalex.org/W4402716131","doi":"https://doi.org/10.1109/cvpr52733.2024.01261"},"language":"en","primary_location":{"id":"doi:10.1109/cvpr52733.2024.01261","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cvpr52733.2024.01261","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101798999","display_name":"Wujian Peng","orcid":"https://orcid.org/0009-0001-6428-276X"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Wujian Peng","raw_affiliation_strings":["School of CS, Fudan University,Shanghai Key Lab of Intell. Info. Processing"],"affiliations":[{"raw_affiliation_string":"School of CS, Fudan University,Shanghai Key Lab of Intell. Info. Processing","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061628503","display_name":"Sicheng Xie","orcid":null},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Sicheng Xie","raw_affiliation_strings":["School of CS, Fudan University,Shanghai Key Lab of Intell. Info. Processing"],"affiliations":[{"raw_affiliation_string":"School of CS, Fudan University,Shanghai Key Lab of Intell. Info. Processing","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100580466","display_name":"Zuyao You","orcid":null},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zuyao You","raw_affiliation_strings":["School of CS, Fudan University,Shanghai Key Lab of Intell. Info. Processing"],"affiliations":[{"raw_affiliation_string":"School of CS, Fudan University,Shanghai Key Lab of Intell. Info. Processing","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008361912","display_name":"Shiyi Lan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shiyi Lan","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5026167547","display_name":"Zuxuan Wu","orcid":"https://orcid.org/0000-0002-8689-5807"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zuxuan Wu","raw_affiliation_strings":["School of CS, Fudan University,Shanghai Key Lab of Intell. Info. Processing"],"affiliations":[{"raw_affiliation_string":"School of CS, Fudan University,Shanghai Key Lab of Intell. Info. Processing","institution_ids":["https://openalex.org/I24943067"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5101798999"],"corresponding_institution_ids":["https://openalex.org/I24943067"],"apc_list":null,"apc_paid":null,"fwci":1.3121,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.81809577,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"13279","last_page":"13288"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9929999709129333,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9919999837875366,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7475968599319458},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.45509785413742065},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4374805986881256},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.3361222743988037}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7475968599319458},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45509785413742065},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4374805986881256},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3361222743988037}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/cvpr52733.2024.01261","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cvpr52733.2024.01261","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G6994314671","display_name":null,"funder_award_id":"62102092","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":63,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2108598243","https://openalex.org/W2185175083","https://openalex.org/W2896457183","https://openalex.org/W2904565150","https://openalex.org/W3016970897","https://openalex.org/W3103934428","https://openalex.org/W3104279398","https://openalex.org/W3126337491","https://openalex.org/W3166396011","https://openalex.org/W3172798424","https://openalex.org/W3177487519","https://openalex.org/W3209532394","https://openalex.org/W3214685499","https://openalex.org/W4224246420","https://openalex.org/W4226182655","https://openalex.org/W4229042118","https://openalex.org/W4281485151","https://openalex.org/W4281633937","https://openalex.org/W4281930370","https://openalex.org/W4296151206","https://openalex.org/W4296406182","https://openalex.org/W4306820534","https://openalex.org/W4308760226","https://openalex.org/W4310926772","https://openalex.org/W4311642023","https://openalex.org/W4312261477","https://openalex.org/W4312910992","https://openalex.org/W4312933868","https://openalex.org/W4320855685","https://openalex.org/W4324128075","https://openalex.org/W4379474533","https://openalex.org/W4381802186","https://openalex.org/W4383472654","https://openalex.org/W4386072101","https://openalex.org/W4386075494","https://openalex.org/W4386083043","https://openalex.org/W4390872526","https://openalex.org/W4390872842","https://openalex.org/W4390874575","https://openalex.org/W4391128821","https://openalex.org/W4402582789","https://openalex.org/W4402667906","https://openalex.org/W6755207826","https://openalex.org/W6790019176","https://openalex.org/W6791353385","https://openalex.org/W6802987763","https://openalex.org/W6804095316","https://openalex.org/W6810039040","https://openalex.org/W6810708803","https://openalex.org/W6811013733","https://openalex.org/W6811072154","https://openalex.org/W6838639034","https://openalex.org/W6839415613","https://openalex.org/W6842585177","https://openalex.org/W6843018836","https://openalex.org/W6846007759","https://openalex.org/W6846681330","https://openalex.org/W6849829975","https://openalex.org/W6850787431","https://openalex.org/W6853702739","https://openalex.org/W6854511533","https://openalex.org/W6857197186"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052","https://openalex.org/W2382290278","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Vision":[0],"language":[1],"models":[2],"(VLM)":[3],"have":[4],"demonstrated":[5],"re-markable":[6],"performance":[7,119],"across":[8],"various":[9],"downstream":[10],"tasks.":[11],"However,":[12],"understanding":[13],"fine-grained":[14,160],"visual-linguistic":[15],"con-cepts,":[16],"such":[17],"as":[18],"attributes":[19],"and":[20,61,104,174],"inter-object":[21],"relationships,":[22],"re-mains":[23],"a":[24,59,66,75,92,109,134],"significant":[25,126,147],"challenge.":[26],"While":[27],"several":[28],"benchmarks":[29,161],"aim":[30],"to":[31,69,95,122,139],"evaluate":[32],"VLMs":[33,56,114,141],"in":[34,74,81,130,142],"finer":[35],"granularity,":[36],"their":[37,118],"primary":[38],"fo-cus":[39],"remains":[40],"on":[41,115,149,157],"the":[42,46,52,97,153,168],"linguistic":[43],"aspect,":[44],"neglecting":[45],"visual":[47,62],"dimension.":[48],"Here,":[49],"we":[50,89,107,132],"highlight":[51],"importance":[53],"of":[54,99,170],"evaluating":[55],"from":[57],"both":[58],"textual":[60],"perspective.":[63],"We":[64],"intro-duce":[65],"progressive":[67],"pipeline":[68],"synthesize":[70],"images":[71],"that":[72],"vary":[73],"specific":[76],"attribute":[77],"while":[78],"ensuring":[79],"consistency":[80],"all":[82],"other":[83],"aspects.":[84],"Utilizing":[85],"this":[86,129],"data":[87,175],"engine,":[88],"carefully":[90],"design":[91],"benchmark,":[93],"SPEC,":[94],"diagnose":[96],"comprehension":[98],"object":[100],"size,":[101],"position,":[102],"existence,":[103],"count.":[105],"Subsequently,":[106],"con-duct":[108],"thorough":[110],"evaluation":[111],"offour":[112],"leading":[113],"SPEC.":[116],"Surprisingly,":[117],"is":[120],"close":[121],"random":[123],"guess,":[124],"revealing":[125],"limitations.":[127],"With":[128],"mind,":[131],"pro-pose":[133],"simple":[135],"yet":[136],"effective":[137],"approach":[138],"optimize":[140],"fine-":[143],"grained":[144],"understanding,":[145],"achieving":[146],"improve-ments":[148],"SPEC":[150],"without":[151],"compromising":[152],"zero-shot":[154],"performance.":[155],"Results":[156],"two":[158],"additional":[159],"also":[162],"show":[163],"consistent":[164],"improvements,":[165],"further":[166],"validating":[167],"transferability":[169],"our":[171],"approach.":[172],"Code":[173],"are":[176],"available":[177],"at":[178],"https://github.com/wjpoom/SPEC.":[179]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":1}],"updated_date":"2025-12-27T23:08:20.325037","created_date":"2025-10-10T00:00:00"}
