diff --git "a/static/eval_results/all_model_keywords_stats.json" "b/static/eval_results/all_model_keywords_stats.json" new file mode 100644--- /dev/null +++ "b/static/eval_results/all_model_keywords_stats.json" @@ -0,0 +1,3746 @@ +{ + "GPT_4o_mini": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.44928744961868194 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.48842488118273475 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.5152626716886682 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.4672966076116977 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.3406008235342885 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.5572281917334303 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.6902380952380953 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.4189154010048976 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.2943206715105082 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.19422793560945503 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.4700389569079038 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.3624496929166193 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.38946844562183286 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.45508480503584553 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.47569921440672464 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.46468618797917643 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.29410984789062117 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.41174000979649644 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.38893151244736324 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.44244772638735347 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.3629944944697668 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.5713834131825314 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.39874839531459466 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.3359977324263039 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.4260710116168476 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.46322170353087255 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.45508480503584553 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.24651576711552803 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.3697506340557095 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.5640948591986592 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.2420320329702607 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.3458483931206892 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.43544861040322835 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.5176671720617656 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.3554299482098288 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.5398829253460956 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.32918280841495845 + } + } + }, + "Llama_3_2_11B": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.1907604552173455 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.14280015951776653 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.1960311445935766 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.22399113135844315 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.13303760019716085 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.323153603297999 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.4260501253132832 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.1770852858056774 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.15366454315378308 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.06563884729522687 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.11886347847341794 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.11489351406848371 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.1693681214060816 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.2123769209846321 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.2520175802062012 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.24806929522702081 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.06418655520777307 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.12349256529641485 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.16374180545556977 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.1576236804437753 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.15014439824913947 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.3003142292328822 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.19270157739425633 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.1463246409674981 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.0732004839476103 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.19579907898674231 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.2123769209846321 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.1351857051327849 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.18586695387250338 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.17288724679416761 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.08100042975820579 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.0575426944971537 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.19853488174071646 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.254316961351997 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.162801811963855 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.28055776664538923 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.13937853323074623 + } + } + }, + "InternVL2_8B": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.2817247716997634 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.2794121858805306 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2511, + "tasks": [], + "average_score": 0.31918687243853283 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2469, + "tasks": [], + "average_score": 0.325593535916075 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.24118253695139918 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.39684007367798446 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.4700852130325815 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.27052668526005397 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2439, + "tasks": [], + "average_score": 0.23189345356483618 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.08260405712900723 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.2277532691786533 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.2013779290163996 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.2804429603269583 + }, + "Videos": { + "count": 43, + "num_samples": 700, + "tasks": [], + "average_score": 0.34791358240562653 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.2942163420306113 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.33787327172644077 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.10933317885944857 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.24944408255581693 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.25203287826995174 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.27414636444623874 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.22381302045502052 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1456, + "tasks": [], + "average_score": 0.3537549824897016 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.30261189962428353 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.15434618291761149 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.19814032315010577 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.30046383040641306 + }, + "video": { + "count": 43, + "num_samples": 700, + "tasks": [], + "average_score": 0.34791358240562653 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.17725087609332119 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.2532272454839157 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.29096771640715396 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.12166926715781588 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.24700310231619527 + }, + "Perception": { + "count": 145, + "num_samples": 2315, + "tasks": [], + "average_score": 0.3205471121079154 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.3995660275981844 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.24614711281861912 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.3393895915929317 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.22078333222564453 + } + } + }, + "llava_onevision_7B": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.2524786809911341 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.1902376706945491 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.255069390206439 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.29981286990552625 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.18973491465938852 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.36842322314565323 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.44998746867167916 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.2445135206648208 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.21802943568344288 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.06658775725427067 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.1466163383815089 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.13297395577964055 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.24236719143449742 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.30985943541023103 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.3199731020402028 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.3258716730180874 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.13043163858789789 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.20209776978059824 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.18285692568564196 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.25384794412815426 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.2200472229099345 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.3127341248874411 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.2802999516721972 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.1476473922902494 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.13787962981142515 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.25459683619676365 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.30985943541023103 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.1778991941079372 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.2410111891690358 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.19274192395698486 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.09846926279075068 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.15189414475467605 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.2845922887108415 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.3600079950628582 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.23654776813656775 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.3271805711561501 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.22080546908673507 + } + } + }, + "llava_onevision_72B": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.3615741356043519 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.282401662313336 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.36653344218973427 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.42146038539739283 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.2951434804409883 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.478119286755779 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.6005438596491229 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.31663222188988865 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.29633645022129285 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.13872280436872364 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.23294708136735856 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.2126914943750874 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.34566020099204997 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.4446001874842145 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.4401364830377099 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.42429297143518147 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.23897262553543516 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.28614732096244 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.25872873777911126 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.370724080249463 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.3065719940769206 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.4293132525502993 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.3986052416087927 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.20730347694633405 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.27911174307216713 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.3481968601113118 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.4446001874842145 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.25013213032747944 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.34156793747875674 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.30653989171354723 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.18168666652660437 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.23240790940031927 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.38316803441883945 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.4807891958712894 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.31702495228966576 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.4358874880224115 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.31588468105075895 + } + } + }, + "Gemini_1.5_pro_002": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.5201947642961418 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.4989864259016192 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.550842111088751 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.5467324805307577 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.425969084163906 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.5750369536204262 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.6982330827067671 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.513647745999633 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.3845337030093212 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.23899503258223884 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.4592162957187749 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.4292353723689881 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.4869625906903554 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.5028718355967439 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.5584779204331461 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.5495643443147615 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.4292127751495457 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.44828282747008336 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.44137714463131966 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.5146447350354234 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.4688623462674191 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.5580414823700747 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.5538255562099124 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.39066515495086923 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.5295721925617263 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.5032283218366624 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.5028718355967439 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.4885398161821004 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.4553778359922855 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.5378983862471568 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.3335324339429373 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.43465181771633377 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.524603412718188 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.5821004797173627 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.5124355410095621 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.5721991184410764 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.41210885517904977 + } + } + }, + "MiniCPM_v2.6": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.2604969133146555 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.24828453993935928 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.2987613496312298 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.31808788094038193 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.18281637763548025 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.4073231792632807 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.48798245614035085 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.23723675736151562 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.1968926733821904 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.08735883237069725 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.21153173491931837 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.18639148159043903 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.21578309681746147 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.3527537836840162 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.3096882575625531 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.31628986040092516 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.0755920550038197 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.23302306387939006 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.17775369699584467 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.2551275278138797 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.20833171754655547 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.36473950920880716 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.293386806641223 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.13955971277399848 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.23499726844115643 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.2625611181730622 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.3527537836840162 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.17888270664238365 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.22288678972853282 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.26614948589295767 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.11693267119342445 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.15342045420318667 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.2910511308735813 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.3777897246686755 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.25714862989687987 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.33187792895542906 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.16493399805627715 + } + } + }, + "GPT_4o": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.5628292541089482 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.6173690896799526 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.6122177959113034 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.5822888182775097 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.44177544539510955 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.6344814691282928 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.6795263157894738 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.5514924675940659 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.39435038953269674 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.22934807257231926 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.6046575685772053 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.491325251564869 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.4999089647103332 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.5315979872161023 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.5641404607063637 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.559466820210236 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.47760591698367955 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.5354190939719853 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.4780999465727382 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.5994159671881645 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.44606605087301393 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.6274371950293718 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.5448877153826162 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.4751133786848073 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.5265640970967286 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.5664191419997976 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.5315979872161023 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.4500928191484624 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.490800991115688 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.7011776751799048 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.33202130899313653 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.5032849161169843 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.5491960044393517 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.6095778863474799 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.5283797185155754 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.6135384893140922 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.44047720383044436 + } + } + }, + "Phi-3.5-vision": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.2551037902226636 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.24734930136620975 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.2864612416413776 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.3049602749093698 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.21653804346780042 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.36823084724842464 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.46663157894736845 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.24145330077248778 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.2154692063816354 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.08944481289041872 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.18587661796707747 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.17497379027990792 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.26053460127801603 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.24669318645450836 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.2786226802221388 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.34091066308972107 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.15444746077692828 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.21711219915973207 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.2138304528863496 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.2572371188897671 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.21409351002477045 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.365192668303297 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.25960269434727634 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.12546296296296297 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.14174374624685185 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.2776898347355035 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.24669318645450836 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.20168001345379397 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.2850550871176333 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.22277777000798116 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.08928724806836039 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.219367263034246 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.31585879714366544 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.3945898792928062 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.21925278489551242 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.33264696401038385 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.17575913004138646 + } + } + }, + "InternVL2_76B": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.38191947207402666 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.4103649605406274 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.4341802504488193 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.42654142415639185 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.2975890791763991 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.5257357753421337 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.5779473684210527 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.33287081421166276 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.2949505390920417 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.17036496432397477 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.362195416198664 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.31396468806559114 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.3473756113126343 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.395893002855977 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.44982107744035305 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.42686510293379315 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.2868239162778749 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.3603288661353782 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.3465926907358438 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.3943337471922549 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.29244088978470345 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.45822072478616577 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.3879326330400817 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.20309901738473166 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.34490184941501867 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.41372274360003347 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.395893002855977 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.24403942809507134 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.3152784738582855 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.4290949563510903 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.2132321995754061 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.2953329718984368 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.4201902630957567 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.47409276729986083 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.30014798153766264 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.46253164682269177 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.2868813944130515 + } + } + }, + "Gemini_1.5_flash_002": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.46250942866818673 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.4317914359988347 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.49775198805427967 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.5098686082319499 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.34393279682972117 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.5594391803821158 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.6380250626566416 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.44816564352475535 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.34510790215980036 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.18973764406890803 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.3836737169374586 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.3598139859097534 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.4013870708864889 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.4903530871753026 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.5051202896842343 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.5166044655846657 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.3849084036535956 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.3869438864407766 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.3962715194192418 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.44793686445264996 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.3704146726364947 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.5448638967636353 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.47829883834573317 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.33669690098261523 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.4300676062024303 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.4427944359714585 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.4903530871753026 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.42346517633403413 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.41994719346489817 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.4627701625196691 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.2517485212411566 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.40372378342017806 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.4799408254775632 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.6010361821632402 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.4569546533897065 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.511590428993871 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.33710867194177685 + } + } + }, + "Pixtral_12B": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.34602671066871027 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.3764652079852679 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.38183869685317606 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.3776679463596073 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.2828575553466608 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.4190587833823822 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.5687919799498747 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.32813540763467464 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.2677293131171651 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.10591240329992047 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.30581019415764066 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.28832738144368647 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.3223299098375932 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.409643099998057 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.37450808136321684 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.37068890840142343 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.24009431093278263 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.3071379066920702 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.31782992537086313 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.3639544140938305 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.32073418701669026 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.4166613092238043 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.3008126415966517 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.19743008314436883 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.16370884074367903 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.37086966536142313 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.409643099998057 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.2575699315401612 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.310449170121381 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.4285286292013588 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.13622980866275425 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.2572414987500377 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.388749951743596 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.5020540387409291 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.31301986568151985 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.38094471423409354 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.24222628640267738 + } + } + }, + "Claude_3.5": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.5405089647404562 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.6026892335040172 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.5695311134746034 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.5450038475783499 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.4767692987630454 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.5756126284078804 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.6969774436090224 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.5278843049497918 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.4082144793870471 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.23803578664609892 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.5637906302497772 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.4795267886975966 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.525848282456283 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.508735695828719 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.5699094130430454 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.5078124682977725 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.4429640420975014 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.5039586533964282 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.4926030136534706 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.5278127103234661 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.4490020843308984 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.5838224169821388 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.5456152399978661 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.46300075585789874 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.5292494759360522 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.5364554303845326 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.508735695828719 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.4422556748863689 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.49311554035078103 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.6593763006847053 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.3382015835012861 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.5194010220575684 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.5304907166726288 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.5808831682303479 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.513474611293123 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.5507075880782885 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.47461998432626556 + } + } + }, + "Idefics3": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.14507788965553362 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.11641535161320743 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.17255583910766542 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.14745217246476708 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.1331851390883708 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.19221534222332276 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.28640852130325817 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.17906399043310475 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.10192930055370109 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.04211916597550756 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.10126271262360581 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.11407926733108291 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.16225217317782772 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.16181866973635636 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.1839408679813373 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.14933801491626408 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.0395540896656236 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.13979628998424784 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.1062779093260333 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.07053056796593082 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.09790172378722654 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.2987797010800956 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.11588163814170001 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.1008692365835223 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.09308121224497533 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.14757589734485796 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.16181866973635636 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.12217834249866026 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.12276246278377517 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.14743542163139847 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.05354869594691955 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.09065540194572455 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.1463280929280822 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.14564374862578883 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.22748773785486257 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.17647756032677067 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.13168972973651977 + } + } + }, + "Qwen2_VL_7B": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.370836862933556 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.39973692484032347 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2511, + "tasks": [], + "average_score": 0.4012977216731433 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2469, + "tasks": [], + "average_score": 0.410990923097227 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.2818925976996871 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.493608784197707 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.5215889724310777 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.33309401517140946 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2439, + "tasks": [], + "average_score": 0.27564756843599875 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.1473690605854188 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.3814353882556586 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.2896392967775049 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.3223325179806271 + }, + "Videos": { + "count": 43, + "num_samples": 700, + "tasks": [], + "average_score": 0.4111189310485516 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.34825121621909577 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.4047366473438155 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.262166593895899 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.3403519326516044 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.3420538306638288 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.35162604166912687 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.32665673520415817 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1456, + "tasks": [], + "average_score": 0.3909745200389741 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.39898011714302023 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.19415154950869234 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.37301502633138073 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.3761693199448087 + }, + "video": { + "count": 43, + "num_samples": 700, + "tasks": [], + "average_score": 0.4111189310485516 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.26429868057315387 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.33008667137716374 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.42660307298355216 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.2003871750665659 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.3270187644950453 + }, + "Perception": { + "count": 145, + "num_samples": 2315, + "tasks": [], + "average_score": 0.39864841947520724 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.4245693009859056 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.29880557491654197 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.42766370932167636 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.25562039051316643 + } + } + }, + "Qwen2_VL_72B": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.49774395003470484 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.538829507114716 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.534480883952292 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.5092565754998357 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.3776739609562984 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.5676174603436022 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.60496992481203 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.4633019068994453 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.35105970797600183 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.2201150812944581 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.5356361790015363 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.4289777675393297 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.42094543671351287 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.49943888306036405 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.507967430369507 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.495761900914191 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.36212605501536715 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.4444770652190341 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.44584364394901616 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.5098505660529429 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.4027115384266939 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.5157810622684265 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.5199940976484408 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.3100812547241119 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.5364299983756791 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.4908605783408196 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.49943888306036405 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.36691704884033916 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.45169664275718613 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.5748195752273694 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.31245958897213383 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.4372517645050852 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.5343715685033166 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.4968249101570037 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.4488852456563113 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.5162919233645259 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.31157492395100744 + } + } + } +} \ No newline at end of file