cccjc committed on
Commit
8c04f42
1 Parent(s): 6499078

Add task count into table column name

Browse files
Files changed (2) hide show
  1. app.py +2 -2
  2. utils.py +35 -13
app.py CHANGED
@@ -55,8 +55,8 @@ with gr.Blocks() as block:
55
  )
56
 
57
  # Define different captions for each table
58
- default_caption = "**Table 1: MEGA-Bench full results.** <br> The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806). <br> $\\text{Overall} \\ = \\ \\frac{\\max(\\text{Core w/o CoT}, \\ \\text{Core w/ CoT}) \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$"
59
- core_single_image_caption = "**Table 2: MEGA-Bench Core Single-image results.** <br> This subset contains 273 single-image tasks from the Core set of the benchmark. For open-source models, we drop the image input in the 1-shot demonstration example so that the entire query contains a single image only. Compared to the default table, some models with only single-image support are added."
60
 
61
  caption_component = gr.Markdown(
62
  value=default_caption,
 
55
  )
56
 
57
  # Define different captions for each table
58
+ default_caption = "**Table 1: MEGA-Bench full results.** The number in the parentheses is the number of tasks of each keyword. <br> The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806). <br> $\\text{Overall} \\ = \\ \\frac{\\max(\\text{Core w/o CoT}, \\ \\text{Core w/ CoT}) \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$"
59
+ core_single_image_caption = "**Table 2: MEGA-Bench Core Single-image results.** The number in the parentheses is the number of tasks in each keyword. <br> This subset contains 273 single-image tasks from the Core set of the benchmark. For open-source models, we drop the image input in the 1-shot demonstration example so that the entire query contains a single image only. <br> Compared to the default table, some models with only single-image support are added."
60
 
61
  caption_component = gr.Markdown(
62
  value=default_caption,
utils.py CHANGED
@@ -102,13 +102,35 @@ class BaseDataLoader:
102
  self.MODEL_GROUPS = self._initialize_model_groups()
103
 
104
  def _initialize_super_groups(self):
105
- # Define the desired order of super groups
 
106
 
107
- groups = {DIMENSION_NAME_MAP[dim]: [KEYWORD_NAME_MAP.get(k, k) for k in self.MODEL_DATA[next(iter(self.MODEL_DATA))][dim].keys()]
108
- for dim in self.MODEL_DATA[next(iter(self.MODEL_DATA))]}
 
109
 
110
- order = ["Skills", "Application", "Output Format", "Input Format", "Visual Input Number"]
111
- # Sort the dictionary based on the predefined order
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  return {k: groups[k] for k in order if k in groups}
113
 
114
  def _initialize_model_groups(self) -> Dict[str, list]:
@@ -167,12 +189,12 @@ class DefaultDataLoader(BaseDataLoader):
167
  "Core(w/ CoT)": round(core_cot_score * 100, 2),
168
  "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2)
169
  }
170
- for keyword in self.SUPER_GROUPS[selected_super_group]:
171
- original_keyword = get_original_keyword(keyword)
172
  if original_dimension in model_data and original_keyword in model_data[original_dimension]:
173
- row[keyword] = round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2)
174
  else:
175
- row[keyword] = None
176
  data.append(row)
177
 
178
  df = pd.DataFrame(data)
@@ -209,12 +231,12 @@ class CoreSingleDataLoader(BaseDataLoader):
209
  "Models": get_display_model_name(model),
210
  "Core SI": round(core_si_score * 100, 2),
211
  }
212
- for keyword in self.SUPER_GROUPS[selected_super_group]:
213
- original_keyword = get_original_keyword(keyword)
214
  if original_dimension in model_data and original_keyword in model_data[original_dimension]:
215
- row[keyword] = round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2)
216
  else:
217
- row[keyword] = None
218
  data.append(row)
219
 
220
  df = pd.DataFrame(data)
 
102
  self.MODEL_GROUPS = self._initialize_model_groups()
103
 
104
  def _initialize_super_groups(self):
105
+ # Get a sample model to access the structure
106
+ sample_model = next(iter(self.MODEL_DATA))
107
 
108
+ # Create groups with task counts
109
+ groups = {}
110
+ self.keyword_display_map = {} # Add this map to store display-to-original mapping
111
 
112
+ for dim in self.MODEL_DATA[sample_model]:
113
+ dim_name = DIMENSION_NAME_MAP[dim]
114
+ # Create a list of tuples (display_name, count, keyword) for sorting
115
+ keyword_info = []
116
+
117
+ for keyword in self.MODEL_DATA[sample_model][dim]:
118
+ # Get the task count for this keyword
119
+ task_count = self.MODEL_DATA[sample_model][dim][keyword]["count"]
120
+ original_name = KEYWORD_NAME_MAP.get(keyword, keyword)
121
+ display_name = f"{original_name}({task_count})"
122
+ keyword_info.append((display_name, task_count, keyword))
123
+
124
+ # Sort by count (descending) and then by display name (for ties)
125
+ keyword_info.sort(key=lambda x: (-x[1], x[0]))
126
+
127
+ # Store sorted display names and update mapping
128
+ groups[dim_name] = [info[0] for info in keyword_info]
129
+ for display_name, _, keyword in keyword_info:
130
+ self.keyword_display_map[display_name] = keyword
131
+
132
+ # Sort based on predefined order
133
+ order = ["Application", "Skills", "Output Format", "Input Format", "Visual Input Number"]
134
  return {k: groups[k] for k in order if k in groups}
135
 
136
  def _initialize_model_groups(self) -> Dict[str, list]:
 
189
  "Core(w/ CoT)": round(core_cot_score * 100, 2),
190
  "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2)
191
  }
192
+ for display_name in self.SUPER_GROUPS[selected_super_group]:
193
+ original_keyword = self.keyword_display_map[display_name]
194
  if original_dimension in model_data and original_keyword in model_data[original_dimension]:
195
+ row[display_name] = round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2)
196
  else:
197
+ row[display_name] = None
198
  data.append(row)
199
 
200
  df = pd.DataFrame(data)
 
231
  "Models": get_display_model_name(model),
232
  "Core SI": round(core_si_score * 100, 2),
233
  }
234
+ for display_name in self.SUPER_GROUPS[selected_super_group]:
235
+ original_keyword = self.keyword_display_map[display_name]
236
  if original_dimension in model_data and original_keyword in model_data[original_dimension]:
237
+ row[display_name] = round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2)
238
  else:
239
+ row[display_name] = None
240
  data.append(row)
241
 
242
  df = pd.DataFrame(data)