Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
kennymckormick
committed on
Commit
•
64d336c
1
Parent(s):
a6e43e6
add OCRBench
Browse files
- gen_table.py +5 -0
- meta_data.py +8 -1
gen_table.py
CHANGED
@@ -78,6 +78,8 @@ def BUILD_L1_DF(results, fields):
|
|
78 |
res[d].append(item[d]['Overall'])
|
79 |
if d == 'MME':
|
80 |
scores.append(item[d]['Overall'] / 28)
|
|
|
|
|
81 |
else:
|
82 |
scores.append(item[d]['Overall'])
|
83 |
ranks.append(nth_large(item[d]['Overall'], [x[d]['Overall'] for x in results.values()]))
|
@@ -106,6 +108,9 @@ def BUILD_L2_DF(results, dataset):
|
|
106 |
if dataset == 'MME':
|
107 |
non_overall_fields = [x for x in non_overall_fields if not listinstr(['Perception', 'Cognition'], x)]
|
108 |
overall_fields = overall_fields + ['Perception', 'Cognition']
|
|
|
|
|
|
|
109 |
|
110 |
for m in results:
|
111 |
item = results[m]
|
|
|
78 |
res[d].append(item[d]['Overall'])
|
79 |
if d == 'MME':
|
80 |
scores.append(item[d]['Overall'] / 28)
|
81 |
+
elif d == 'OCRBench':
|
82 |
+
scores.append(item[d]['Final Score'] / 10)
|
83 |
else:
|
84 |
scores.append(item[d]['Overall'])
|
85 |
ranks.append(nth_large(item[d]['Overall'], [x[d]['Overall'] for x in results.values()]))
|
|
|
108 |
if dataset == 'MME':
|
109 |
non_overall_fields = [x for x in non_overall_fields if not listinstr(['Perception', 'Cognition'], x)]
|
110 |
overall_fields = overall_fields + ['Perception', 'Cognition']
|
111 |
+
if dataset == 'OCRBench':
|
112 |
+
non_overall_fields = [x for x in non_overall_fields if not listinstr(['Final Score'], x)]
|
113 |
+
overall_fields = ['Final Score']
|
114 |
|
115 |
for m in results:
|
116 |
item = results[m]
|
meta_data.py
CHANGED
@@ -124,4 +124,11 @@ LEADERBOARD_MD['ScienceQA_VAL'] = """
|
|
124 |
- During evaluation, we use `GPT-3.5-Turbo-0613` as the choice extractor for all VLMs if the choice can not be extracted via heuristic matching. **Zero-shot** inference is adopted.
|
125 |
"""
|
126 |
|
127 |
-
LEADERBOARD_MD['ScienceQA_TEST'] = LEADERBOARD_MD['ScienceQA_VAL']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
- During evaluation, we use `GPT-3.5-Turbo-0613` as the choice extractor for all VLMs if the choice can not be extracted via heuristic matching. **Zero-shot** inference is adopted.
|
125 |
"""
|
126 |
|
127 |
+
LEADERBOARD_MD['ScienceQA_TEST'] = LEADERBOARD_MD['ScienceQA_VAL']
|
128 |
+
|
129 |
+
LEADERBOARD_MD['OCRBench'] = """
|
130 |
+
## OCRBench Evaluation Results
|
131 |
+
|
132 |
+
- The evaluation of OCRBench is implemented by the official team: https://github.com/Yuliang-Liu/MultimodalOCR.
|
133 |
+
- The performance of GPT4V might be underestimated: GPT4V rejects to answer 12 percent of the questions due to the policy of OpenAI. For those questions, the returned answer is "Your input image may contain content that is not allowed by our safety system."
|
134 |
+
"""
|