Spaces: Running
Commit: Add task count into table column name
Browse files
app.py
CHANGED
@@ -55,8 +55,8 @@ with gr.Blocks() as block:
|
|
55 |
)
|
56 |
|
57 |
# Define different captions for each table
|
58 |
-
default_caption = "**Table 1: MEGA-Bench full results.** <br> The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806). <br> $\\text{Overall} \\ = \\ \\frac{\\max(\\text{Core w/o CoT}, \\ \\text{Core w/ CoT}) \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$"
|
59 |
-
core_single_image_caption = "**Table 2: MEGA-Bench Core Single-image results.** <br> This subset contains 273 single-image tasks from the Core set of the benchmark. For open-source models, we drop the image input in the 1-shot demonstration example so that the entire query contains a single image only. Compared to the default table, some models with only single-image support are added."
|
60 |
|
61 |
caption_component = gr.Markdown(
|
62 |
value=default_caption,
|
|
|
55 |
)
|
56 |
|
57 |
# Define different captions for each table
|
58 |
+
default_caption = "**Table 1: MEGA-Bench full results.** The number in the parentheses is the number of tasks of each keyword. <br> The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806). <br> $\\text{Overall} \\ = \\ \\frac{\\max(\\text{Core w/o CoT}, \\ \\text{Core w/ CoT}) \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$"
|
59 |
+
core_single_image_caption = "**Table 2: MEGA-Bench Core Single-image results.** The number in the parentheses is the number of tasks in each keyword. <br> This subset contains 273 single-image tasks from the Core set of the benchmark. For open-source models, we drop the image input in the 1-shot demonstration example so that the entire query contains a single image only. <br> Compared to the default table, some models with only single-image support are added."
|
60 |
|
61 |
caption_component = gr.Markdown(
|
62 |
value=default_caption,
|
utils.py
CHANGED
@@ -102,13 +102,35 @@ class BaseDataLoader:
|
|
102 |
self.MODEL_GROUPS = self._initialize_model_groups()
|
103 |
|
104 |
def _initialize_super_groups(self):
|
105 |
-
#
|
|
|
106 |
|
107 |
-
|
108 |
-
|
|
|
109 |
|
110 |
-
|
111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
return {k: groups[k] for k in order if k in groups}
|
113 |
|
114 |
def _initialize_model_groups(self) -> Dict[str, list]:
|
@@ -167,12 +189,12 @@ class DefaultDataLoader(BaseDataLoader):
|
|
167 |
"Core(w/ CoT)": round(core_cot_score * 100, 2),
|
168 |
"Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2)
|
169 |
}
|
170 |
-
for
|
171 |
-
original_keyword =
|
172 |
if original_dimension in model_data and original_keyword in model_data[original_dimension]:
|
173 |
-
row[
|
174 |
else:
|
175 |
-
row[
|
176 |
data.append(row)
|
177 |
|
178 |
df = pd.DataFrame(data)
|
@@ -209,12 +231,12 @@ class CoreSingleDataLoader(BaseDataLoader):
|
|
209 |
"Models": get_display_model_name(model),
|
210 |
"Core SI": round(core_si_score * 100, 2),
|
211 |
}
|
212 |
-
for
|
213 |
-
original_keyword =
|
214 |
if original_dimension in model_data and original_keyword in model_data[original_dimension]:
|
215 |
-
row[
|
216 |
else:
|
217 |
-
row[
|
218 |
data.append(row)
|
219 |
|
220 |
df = pd.DataFrame(data)
|
|
|
102 |
self.MODEL_GROUPS = self._initialize_model_groups()
|
103 |
|
104 |
def _initialize_super_groups(self):
    """Build the dimension -> ordered-column mapping used by the leaderboard tables.

    Samples one model's entry to discover the dimension/keyword structure
    (assumes all models share the same structure -- TODO confirm), labels each
    keyword column as ``"Name(count)"`` using its task count, and fills
    ``self.keyword_display_map`` so display labels can be resolved back to raw
    keywords later. Returns the groups in a fixed, curated dimension order.
    """
    reference_model = next(iter(self.MODEL_DATA))

    per_dim_columns = {}
    # Reverse lookup: "Name(count)" display label -> raw keyword.
    self.keyword_display_map = {}

    for dimension, keywords in self.MODEL_DATA[reference_model].items():
        decorated = []
        for raw_keyword, stats in keywords.items():
            count = stats["count"]
            label = f"{KEYWORD_NAME_MAP.get(raw_keyword, raw_keyword)}({count})"
            decorated.append((label, count, raw_keyword))

        # Largest task count first; alphabetical label order breaks ties.
        ranked = sorted(decorated, key=lambda item: (-item[1], item[0]))

        per_dim_columns[DIMENSION_NAME_MAP[dimension]] = [label for label, _, _ in ranked]
        self.keyword_display_map.update({label: kw for label, _, kw in ranked})

    # Present dimensions in a fixed, predefined order, skipping any absent ones.
    order = ["Application", "Skills", "Output Format", "Input Format", "Visual Input Number"]
    return {k: per_dim_columns[k] for k in order if k in per_dim_columns}
|
135 |
|
136 |
def _initialize_model_groups(self) -> Dict[str, list]:
|
|
|
189 |
"Core(w/ CoT)": round(core_cot_score * 100, 2),
|
190 |
"Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2)
|
191 |
}
|
192 |
+
for display_name in self.SUPER_GROUPS[selected_super_group]:
|
193 |
+
original_keyword = self.keyword_display_map[display_name]
|
194 |
if original_dimension in model_data and original_keyword in model_data[original_dimension]:
|
195 |
+
row[display_name] = round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2)
|
196 |
else:
|
197 |
+
row[display_name] = None
|
198 |
data.append(row)
|
199 |
|
200 |
df = pd.DataFrame(data)
|
|
|
231 |
"Models": get_display_model_name(model),
|
232 |
"Core SI": round(core_si_score * 100, 2),
|
233 |
}
|
234 |
+
for display_name in self.SUPER_GROUPS[selected_super_group]:
|
235 |
+
original_keyword = self.keyword_display_map[display_name]
|
236 |
if original_dimension in model_data and original_keyword in model_data[original_dimension]:
|
237 |
+
row[display_name] = round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2)
|
238 |
else:
|
239 |
+
row[display_name] = None
|
240 |
data.append(row)
|
241 |
|
242 |
df = pd.DataFrame(data)
|