dump from mock space

Files changed:
- .gitattributes +1 -1
- .gitignore +226 -11
- README.md +3 -36
- app.py +66 -191
- constants.py +81 -0
- requirements.txt +16 -14
- static/css/style.css +62 -0
- static/eval_results/all_model_keywords_stats.json +0 -0 (diff too large to render)
- static/eval_results/all_summary.json +418 -0
- utils.py +148 -0
.gitattributes CHANGED
@@ -25,6 +25,7 @@
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -32,4 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,13 +1,228 @@
-[old lines 1-2 not rendered in dump]
+# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,macos
+# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode,macos
+
+### macOS ###
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+### macOS Patch ###
+# iCloud generated files
+*.icloud
+
+### Python ###
+# Byte-compiled / optimized / DLL files
 __pycache__/
-[old line 4 not rendered in dump]
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
 .ipynb_checkpoints
-[old lines 6-13 not rendered in dump]
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+
+# ruff
+.ruff_cache/
+
+# LSP config files
+pyrightconfig.json
+
+### VisualStudioCode ###
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+### VisualStudioCode Patch ###
+# Ignore all local history of files
+.history
+.ionide
+
+# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,macos
README.md CHANGED
@@ -1,45 +1,12 @@
 ---
-title: MEGA
+title: MEGA-Bench
 emoji: 🥇
-colorFrom:
+colorFrom: blue
 colorTo: indigo
 sdk: gradio
+sdk_version: 5.1.0
 app_file: app.py
 pinned: true
 license: apache-2.0
-short_description: The space for the leaderboard and information of MEGA-Bench
 ---
 
-# Start the configuration
-
-Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
-
-Results files should have the following format and be stored as json files:
-```json
-{
-    "config": {
-        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
-        "model_name": "path of the model on the hub: org/model",
-        "model_sha": "revision on the hub",
-    },
-    "results": {
-        "task_name": {
-            "metric_name": score,
-        },
-        "task_name2": {
-            "metric_name": score,
-        }
-    }
-}
-```
-
-Request files are created automatically by this tool.
-
-If you encounter problem on the space, don't hesitate to restart it to remove the create eval-queue, eval-queue-bk, eval-results and eval-results-bk created folder.
-
-# Code logic for more complex edits
-
-You'll find
-- the main table' columns names and properties in `src/display/utils.py`
-- the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
-- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
app.py CHANGED
@@ -1,204 +1,79 @@
 import gradio as gr
-from
-import
-from
-[old lines 5-13 not rendered in dump]
-)
-[old lines 15-21 not rendered in dump]
-    ModelType,
-    fields,
-    WeightType,
-    Precision
-)
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
-
-
-def restart_space():
-    API.restart_space(repo_id=REPO_ID)
-
-### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
-except Exception:
-    restart_space()
-
 
-[old lines 52-54 not rendered in dump]
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
-def init_leaderboard(dataframe):
-    if dataframe is None or dataframe.empty:
-        raise ValueError("Leaderboard DataFrame is empty or None.")
-    return Leaderboard(
-        value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
-        bool_checkboxgroup_label="Hide models",
-        interactive=False,
     )
-
-
-demo = gr.Blocks(css=custom_css)
-with demo:
-    gr.HTML(TITLE)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("
-[old lines 99-109 not rendered in dump]
-            with gr.Accordion(
-                f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                open=False,
-            ):
-                with gr.Row():
-                    finished_eval_table = gr.components.Dataframe(
-                        value=finished_eval_queue_df,
-                        headers=EVAL_COLS,
-                        datatype=EVAL_TYPES,
-                        row_count=5,
-                    )
-            with gr.Accordion(
-                f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                open=False,
-            ):
-                with gr.Row():
-                    running_eval_table = gr.components.Dataframe(
-                        value=running_eval_queue_df,
-                        headers=EVAL_COLS,
-                        datatype=EVAL_TYPES,
-                        row_count=5,
-                    )
 
-            with gr.Accordion(
-                f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                open=False,
-            ):
-                with gr.Row():
-                    pending_eval_table = gr.components.Dataframe(
-                        value=pending_eval_queue_df,
-                        headers=EVAL_COLS,
-                        datatype=EVAL_TYPES,
-                        row_count=5,
-                    )
             with gr.Row():
-                gr.
 
             with gr.Row():
-
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
 
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
 
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
 
-[old lines 191-193 not rendered in dump]
-            value=CITATION_BUTTON_TEXT,
-            label=CITATION_BUTTON_LABEL,
-            lines=20,
-            elem_id="citation-button",
-            show_copy_button=True,
-        )
 
-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
-scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()

 import gradio as gr
+from utils import get_leaderboard_data, SUPER_GROUPS, MODEL_GROUPS
+import os
+from constants import *
+
+# Get the directory of the current script
+current_dir = os.path.dirname(os.path.abspath(__file__))
+
+# Construct the path to the CSS file
+css_file = os.path.join(current_dir, "static", "css", "style.css")
+
+# Read the CSS file
+with open(css_file, "r") as f:
+    css = f.read()
+
+def update_leaderboard(selected_super_group, selected_model_group):
+    headers, data = get_leaderboard_data(selected_super_group, selected_model_group)
+    return gr.Dataframe(
+        value=data,
+        headers=headers,
+        datatype=["str"] + ["number"] * (len(headers) - 1),
     )
 
+with gr.Blocks(css=css) as block:
+    gr.Markdown(
+        LEADERBOARD_INTRODUCTION
    )
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("📊 MEGA-Bench", elem_id="qa-tab-table1", id=1):
+            with gr.Row():
+                with gr.Accordion("Citation", open=False):
+                    citation_button = gr.Textbox(
+                        value=CITATION_BUTTON_TEXT,
+                        label=CITATION_BUTTON_LABEL,
+                        elem_id="citation-button",
+                        lines=10,
+                    )
+            gr.Markdown(
+                TABLE_INTRODUCTION
+            )
 
             with gr.Row():
+                super_group_selector = gr.Radio(
+                    choices=list(SUPER_GROUPS.keys()),
+                    label="Select a dimension to display breakdown results",
+                    value=list(SUPER_GROUPS.keys())[0]
+                )
+                model_group_selector = gr.Radio(
+                    choices=list(MODEL_GROUPS.keys()),
+                    label="Select a model group",
+                    value="All"
+                )
+
+            initial_headers, initial_data = get_leaderboard_data(list(SUPER_GROUPS.keys())[0], "All")
+            data_component = gr.Dataframe(
+                value=initial_data,
+                headers=initial_headers,
+                datatype=["str"] + ["number"] * (len(initial_headers) - 1),
+                interactive=False,
+                elem_classes="custom-dataframe",
+            )
+            refresh_button = gr.Button("Refresh")
+            refresh_button.click(fn=update_leaderboard, inputs=[super_group_selector, model_group_selector], outputs=[data_component])
+            super_group_selector.change(fn=update_leaderboard, inputs=[super_group_selector, model_group_selector], outputs=[data_component])
+            model_group_selector.change(fn=update_leaderboard, inputs=[super_group_selector, model_group_selector], outputs=[data_component])
 
+        with gr.TabItem("📝 Data Information", elem_id="qa-tab-table2", id=2):
+            gr.Markdown(DATA_INFO, elem_classes="markdown-text")
+
+        with gr.TabItem("🚀 Submit here! ", elem_id="submit-tab", id=3):
             with gr.Row():
+                gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
 
+if __name__ == "__main__":
+    block.launch(share=True)
+    #block.launch(server_name="127.0.0.1", server_port=7860)
constants.py ADDED
@@ -0,0 +1,81 @@
import os

HF_TOKEN = os.environ.get("HF_TOKEN")

LEADERBOARD_INTRODUCTION = """# MEGA-Bench Leaderboard

## 🚀 Introduction

[MEGA-Bench](https://tiger-ai-lab.github.io/MEGA-Bench/) is a comprehensive benchmark scaling multimodal evaluation to 500+ real-world tasks!

We aim to provide cost-effective and accurate evaluation for multimodal models, covering a wide range of real-world tasks. You don't have to run models on dozens of benchmarks -- MEGA-Bench delivers a comprehensive performance report in a single benchmark.

## 🧐 Highlights of MEGA-Bench

- 505 diverse tasks evaluating multimodal models across 8 grand application types, 7 input visual formats, 6 output formats, and 10 general multimodal skills, covering single-image, multi-image, and video tasks
- Moves beyond multiple-choice questions, offering diverse output formats like numbers, code, LaTeX, phrases, free-form responses, and more. We developed 45 customized metrics to accurately evaluate these diverse outputs
- Focuses on task diversity rather than repetitive examples, ensuring cost-efficient evaluation
- Provides fine-grained capability reports across application type, input/output formats, and required skills


## 🔨 Systematic Annotation Process

- Guided by an initial application-driven taxonomy tree
- 16 expert annotators contributing to a 2-round process to develop 505 tasks
- Utilizes advanced tools for task design, review, and quality control
- Ensures high-quality data through continuous refinement and balanced task distribution


## 📊🔍 Results & Takeaways from Evaluating Top Models

- GPT-4o leads the benchmark, outperforming the second-best model, Claude 3.5, by 3.5%
- Qwen2-VL stands out among open-source models, nearing flagship-level performance
- Chain-of-Thought (CoT) prompting improves proprietary models but has limited impact on open-source models
- Efficiency models like Gemini 1.5 Flash perform well but struggle with UI and document tasks
- Many open-source models face challenges in adhering to output format instructions

## 🎯 Interactive Visualization

Visit our [project page](https://tiger-ai-lab.github.io/MEGA-Bench/) to explore the interactive task taxonomy and radar maps, offering deep insights into model capabilities across multiple dimensions. Discover a comprehensive breakdown far beyond single-score evaluations.


## 📚 More Information

- Our evaluation pipeline will soon be available on our GitHub: https://github.com/TIGER-AI-Lab/MEGA-Bench.
- Check full details of our paper at [https://arxiv.org/abs/2410.10563](https://arxiv.org/abs/2410.10563)
- Hugging Face Datasets: [https://huggingface.co/datasets/TIGER-Lab/MEGA-Bench](https://huggingface.co/datasets/TIGER-Lab/MEGA-Bench)
- GitHub: [https://github.com/TIGER-AI-Lab/MEGA-Bench](https://github.com/TIGER-AI-Lab/MEGA-Bench)

"""

TABLE_INTRODUCTION = """
"""

DATA_INFO = """
### Data Sources
The data sources of MEGA-Bench tasks fall into three main types:
- **Purely self-designed:** The task is designed entirely by the annotator, who collects the image or video resources from the Internet or even produces them with code or a simulator.
- **Inspired and adapted from existing benchmarks:** The task is inspired by existing benchmarks or datasets. The annotator collects the raw image/video data from existing datasets but does not use the original annotation. The annotator redesigns/repurposes the data by writing concrete task descriptions and creating new questions and answers, or using scripts to re-process the data for the designed task.
- **Directly converted from existing benchmarks:** The task is directly converted from existing benchmarks or datasets. The annotator randomly samples a subset from the existing benchmark, directly using its image/video and the annotation without redesign.

In our annotation process, the first two task types are encouraged. The task reviewers strictly control the number of tasks of the third type and reject submissions when an annotator contributes too many of them.

Please refer to Table 17 of our [paper](https://arxiv.org/abs/2410.10563) for the detailed data source of all tasks in MEGA-Bench.
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite our paper and evaluation results below"
CITATION_BUTTON_TEXT = r"""
@article{chen2024mega-bench,
  title={MEGA-Bench: Scaling Multimodal Evaluation to over 500 Real-World Tasks},
  author={Chen, Jiacheng and Liang, Tianhao and Siu, Sherman and Wang, Zhengqing and Wang, Kai and Wang, Yubo and Ni, Yuansheng and Zhu, Wang and Jiang, Ziyan and Lyu, Bohan and Jiang, Dongfu and He, Xuan and Liu, Yuan and Hu, Hexiang and Yue, Xiang and Chen, Wenhu},
  journal={arXiv preprint arXiv:2410.10563},
  year={2024},
}
"""

SUBMIT_INTRODUCTION = """# Submit on MEGA-Bench Leaderboard

We will provide details on how to submit result files once our evaluation pipeline is released on our [GitHub repository](https://github.com/TIGER-AI-Lab/MEGA-Bench).

"""
requirements.txt CHANGED
@@ -1,16 +1,18 @@
-APScheduler
-black
-[old lines 3-4 not rendered in dump]
-gradio
-[old lines 6-12 not rendered in dump]
-tqdm
-transformers
+APScheduler==3.10.1
+black==23.11.0
+click==8.1.3
+datasets==2.14.5
+gradio==5.1.0
+gradio_client==1.4.0
+huggingface-hub==0.25.2
+matplotlib==3.7.1
+numpy==1.24.2
+pandas==2.0.0
+python-dateutil==2.8.2
+requests==2.28.2
+tqdm==4.65.0
+transformers==4.35.2
 tokenizers>=0.15.0
+git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
+accelerate==0.24.1
 sentencepiece
static/css/style.css ADDED
@@ -0,0 +1,62 @@
.custom-dataframe {
    width: 100% !important;
    overflow-x: auto !important;
}

.custom-dataframe table {
    width: auto !important;
    min-width: 100% !important;
    font-size: 14px !important;
}

.custom-dataframe thead th {
    padding: 4px 8px !important;
    text-align: center !important;
    vertical-align: middle !important;
    white-space: nowrap !important;
    overflow: visible !important;
    font-size: 12px !important;
    font-weight: bold !important;
    line-height: 1.2 !important;
}

.custom-dataframe tbody td {
    padding: 4px 8px !important;
    text-align: right !important;
    vertical-align: middle !important;
    white-space: nowrap !important;
    overflow: visible !important;
    line-height: 1.2 !important;
}

.custom-dataframe tbody td:first-child {
    text-align: left !important;
}

/* Adjust the sort indicator position */
.custom-dataframe thead th::after {
    font-size: 12px !important;
    line-height: 1 !important;
    margin-left: 4px !important;
}

/* Style for global result columns */
.custom-dataframe thead th:nth-child(-n+4),
.custom-dataframe tbody td:nth-child(-n+4) {
    background-color: #f0f8ff !important; /* Light blue background */
}

/* Style for dimension-specific result columns */
.custom-dataframe thead th:nth-child(n+5),
.custom-dataframe tbody td:nth-child(n+5) {
    background-color: #f0fff0 !important; /* Light green background */
}

/* Alternating row colors for better readability */
.custom-dataframe tbody tr:nth-child(even) td:nth-child(-n+4) {
    background-color: #e6f3ff !important; /* Slightly darker light blue */
}

.custom-dataframe tbody tr:nth-child(even) td:nth-child(n+5) {
    background-color: #e6ffe6 !important; /* Slightly darker light green */
}
static/eval_results/all_model_keywords_stats.json ADDED
The diff for this file is too large to render. See the raw diff.
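Since the keyword-level stats file is not rendered here, the sketch below illustrates the structure that `utils.py` (further down) expects from it, inferred from its access pattern `MODEL_DATA[model][dimension][keyword]["average_score"]`. The dimension and keyword names are taken from `DIMENSION_NAME_MAP` and `KEYWORD_NAME_MAP` in `utils.py`; every score is an invented placeholder.

```python
# Hypothetical excerpt of all_model_keywords_stats.json, written as a Python
# literal. Inferred from utils.py, not taken from the real file; all scores
# below are invented placeholders.
MODEL_DATA_EXAMPLE = {
    "GPT_4o": {
        "skills": {
            "Text Recognition (OCR)": {"average_score": 0.51},  # invented
            # ... one entry per skill keyword
        },
        "input_format": {
            "Videos": {"average_score": 0.40},  # invented
        },
        "output_format": {
            "multiple_choice": {"average_score": 0.62},  # invented
        },
        "input_num": {
            "1-image": {"average_score": 0.53},  # invented
        },
        "app": {
            "Coding": {"average_score": 0.45},  # invented
        },
    },
    # ... one entry per model
}
```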
static/eval_results/all_summary.json ADDED
@@ -0,0 +1,418 @@
{
  "GPT_4o": {
    "core_noncot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.5187898818829914, "micro_mean_score": 0.5127977300993917},
    "core_cot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.5251654337401854, "micro_mean_score": 0.522332974147119},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 2448, "macro_mean_score": 0.6478225794744895, "micro_mean_score": 0.665391229578676},
    "overall_score": 0.5409529871515315
  },
  "Gemini_1.5_pro_002": {
    "core_noncot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.46887846869580546, "micro_mean_score": 0.46403536258864253},
    "core_cot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.481393687771543, "micro_mean_score": 0.4756661334397647},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 2448, "macro_mean_score": 0.5858190649927173, "micro_mean_score": 0.6104901117798793},
    "overall_score": 0.4948345779089219
  },
  "Gemini_1.5_flash_002": {
    "core_noncot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.4183865592515826, "micro_mean_score": 0.41216971462683855},
    "core_cot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.4183865592515826, "micro_mean_score": 0.41216971462683855},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 2168, "macro_mean_score": 0.5691365176285039, "micro_mean_score": 0.5987532244196045},
    "overall_score": 0.4377900192406913
  },
  "Claude_3.5": {
    "core_noncot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.4863241841253708, "micro_mean_score": 0.4798092874490549},
    "core_cot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.5023557473841108, "micro_mean_score": 0.4985442599850241},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 2288, "macro_mean_score": 0.6373907158949892, "micro_mean_score": 0.6569647463456579},
    "overall_score": 0.519736485905313
  },
  "GPT_4o_mini": {
    "core_noncot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.3974259652331149, "micro_mean_score": 0.392578163407945},
    "core_cot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.4070959243997505, "micro_mean_score": 0.40376078514357017},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 1224, "macro_mean_score": 0.586537827213665, "micro_mean_score": 0.6133276010318144},
    "overall_score": 0.43019240694015537
  },
  "Qwen2_VL_72B": {
    "core_noncot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.4623988230573754, "micro_mean_score": 0.4568583770401895},
    "core_cot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.45284699372478177, "micro_mean_score": 0.4487693487093462},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 2448, "macro_mean_score": 0.5639771804231668, "micro_mean_score": 0.5835339638865004},
    "overall_score": 0.4754732650945565
  },
  "Qwen2_VL_7B": {
    "core_noncot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.34725455697890745, "micro_mean_score": 0.34344091516995323},
    "core_cot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.3284357723853296, "micro_mean_score": 0.32443422147119677},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1170, "num_total_samples": 2452, "macro_mean_score": 0.43955105763038577, "micro_mean_score": 0.45508547008546996},
    "overall_score": 0.35913430458751355
  },
  "llava_onevision_72B": {
    "core_noncot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.31960132549012704, "micro_mean_score": 0.3173848563095166},
    "core_cot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.29725827011768174, "micro_mean_score": 0.2954433666362564},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 1224, "macro_mean_score": 0.4599484231632498, "micro_mean_score": 0.4850386930352536},
    "overall_score": 0.33766580340844976
  },
  "llava_onevision_7B": {
    "core_noncot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.2239290419841492, "micro_mean_score": 0.22222171180488767},
    "core_cot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.21347545703998197, "micro_mean_score": 0.210586172002703},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 2448, "macro_mean_score": 0.33979975321921935, "micro_mean_score": 0.36474634565778147},
    "overall_score": 0.23884309392529685
  },
  "InternVL2_76B": {
    "core_noncot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.34977582844066846, "micro_mean_score": 0.3452353155814884},
    "core_cot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.35539585884136143, "micro_mean_score": 0.35043335903915124},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 1224, "macro_mean_score": 0.5192997443033639, "micro_mean_score": 0.5421324161650903},
    "overall_score": 0.37649239855429245
  },
  "InternVL2_8B": {
    "core_noncot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.25920867490737526, "micro_mean_score": 0.2543416126895087},
    "core_cot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.24055897165959364, "micro_mean_score": 0.23784634936127952},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1165, "num_total_samples": 2452, "macro_mean_score": 0.3978571701460552, "micro_mean_score": 0.4108583690987125},
    "overall_score": 0.2770545208291856
  },
  "MiniCPM_v2.6": {
    "core_noncot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.22838207666977445, "micro_mean_score": 0.22452805919103805},
    "core_cot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.22901463640480854, "micro_mean_score": 0.2250606411323753},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 2448, "macro_mean_score": 0.41728623355613875, "micro_mean_score": 0.43452278589853827},
    "overall_score": 0.25324761425596987
  },
  "Phi-3.5-vision": {
    "core_noncot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.23240864879023493, "micro_mean_score": 0.22932978620408923},
    "core_cot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.2295097914016776, "micro_mean_score": 0.2266573336398296},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 2428, "macro_mean_score": 0.3947914647737769, "micro_mean_score": 0.42459157351676696},
    "overall_score": 0.2533094072831661
  },
  "Pixtral_12B": {
    "core_noncot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.3186510310643637, "micro_mean_score": 0.3151734861550665},
    "core_cot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.3132232487306254, "micro_mean_score": 0.30971424472967524},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 1224, "macro_mean_score": 0.4566234428542061, "micro_mean_score": 0.4870593293207223},
    "overall_score": 0.3364098563442444
  },
  "Llama_3_2_11B": {
    "core_noncot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.10044261716549671, "micro_mean_score": 0.09980638766828835},
    "core_cot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.15984490401619783, "micro_mean_score": 0.15794038158731832},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 1224, "macro_mean_score": 0.3173342406187366, "micro_mean_score": 0.3487962166809973},
    "overall_score": 0.1801158087274157
  },
  "Idefics3": {
    "core_noncot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.11118980301103833, "micro_mean_score": 0.11201785633274061},
    "core_cot": {"num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "num_total_samples": 6961, "macro_mean_score": 0.08956972487602757, "micro_mean_score": 0.08982225274252693},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "num_total_samples": 2448, "macro_mean_score": 0.3210866162255635, "micro_mean_score": 0.35649183147033553},
    "overall_score": 0.138206224513898
  }
}
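The stored `overall_score` values appear to be a task-count-weighted mean of the best core macro score (max over the CoT and non-CoT runs) and the open-ended macro score; the sketch below reproduces the stored value for GPT_4o from the other fields in this file. This relationship is inferred from the numbers, not documented by the Space itself.

```python
# Minimal sketch (not part of the Space's code): reconstruct overall_score as a
# task-weighted average of the best core score and the open-ended score.
import json

with open("static/eval_results/all_summary.json") as f:
    summary = json.load(f)

s = summary["GPT_4o"]
core = max(s["core_cot"]["macro_mean_score"], s["core_noncot"]["macro_mean_score"])
n_core = s["core_cot"]["num_eval_tasks"]  # 440 core tasks
n_open = s["open"]["num_eval_tasks"]      # 65 open-ended tasks
overall = (n_core * core + n_open * s["open"]["macro_mean_score"]) / (n_core + n_open)
print(overall)             # 0.5409529871...
print(s["overall_score"])  # 0.5409529871515315 -- matches
```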
utils.py ADDED
@@ -0,0 +1,148 @@
import pandas as pd
import gradio as gr
import csv
import json
import os
import shutil
from huggingface_hub import Repository
import numpy as np

# Load the JSON data
with open("./static/eval_results/all_model_keywords_stats.json", "r") as f:
    MODEL_DATA = json.load(f)

with open("./static/eval_results/all_summary.json", "r") as f:
    SUMMARY_DATA = json.load(f)


# Define model name mapping
MODEL_NAME_MAP = {
    "GPT_4o": "GPT-4o (0513)",
    "Claude_3.5": "Claude-3.5-Sonnet",
    "Gemini_1.5_pro_002": "Gemini-1.5-Pro-002",
    "InternVL2_76B": "InternVL2-Llama3-76B",
    "Qwen2_VL_72B": "Qwen2-VL-72B",
    "llava_onevision_72B": "Llava-OneVision-72B",
    "GPT_4o_mini": "GPT-4o mini",
    "Gemini_1.5_flash_002": "Gemini-1.5-Flash-002",
    "Pixtral_12B": "Pixtral 12B",
    "Qwen2_VL_7B": "Qwen2-VL-7B",
    "InternVL2_8B": "InternVL2-8B",
    "llava_onevision_7B": "Llava-OneVision-7B",
    "Llama_3_2_11B": "Llama-3.2-11B",
    "Phi-3.5-vision": "Phi-3.5-Vision",
    "MiniCPM_v2.6": "MiniCPM-V2.6",
    "Idefics3": "Idefics3-8B-Llama3",
}

# Custom name mapping for dimensions and keywords
DIMENSION_NAME_MAP = {
    "skills": "Skills",
    "input_format": "Input Format",
    "output_format": "Output Format",
    "input_num": "Visual Input Number",
    "app": "Application"
}

KEYWORD_NAME_MAP = {
    # Skills
    "Object Recognition and Classification": "Object Recognition",
    "Text Recognition (OCR)": "OCR",
    "Language Understanding and Generation": "Language",
    "Scene and Event Understanding": "Scene/Event",
    "Mathematical and Logical Reasoning": "Math/Logic",
    "Commonsense and Social Reasoning": "Commonsense",
    "Ethical and Safety Reasoning": "Ethics/Safety",
    "Domain-Specific Knowledge and Skills": "Domain-Specific",
    "Spatial and Temporal Reasoning": "Spatial/Temporal",
    "Planning and Decision Making": "Planning/Decision",
    # Input Format
    "User Interface Screenshots": "UI related",
    "Text-Based Images and Documents": "Documents",
    "Diagrams and Data Visualizations": "Infographics",
    "Videos": "Videos",
    "Artistic and Creative Content": "Arts/Creative",
    "Photographs": "Photographs",
    "3D Models and Aerial Imagery": "3D related",
    # Application
    "Information_Extraction": "Info Extraction",
    "Planning": "Planning",
    "Coding": "Coding",
    "Perception": "Perception",
    "Metrics": "Metrics",
    "Science": "Science",
    "Knowledge": "Knowledge",
    "Mathematics": "Math",
    # Output format
    "contextual_formatted_text": "Contextual",
    "structured_output": "Structured",
    "exact_text": "Exact",
    "numerical_data": "Numerical",
    "open_ended_output": "Open-ended",
    "multiple_choice": "MC",
    # Visual input number
    "6-8 images": "6-8 imgs",
    "1-image": "1 img",
    "2-3 images": "2-3 imgs",
    "4-5 images": "4-5 imgs",
    "9-image or more": "9+ imgs",
    "video": "Video",
}

# Extract super groups (dimensions) and their keywords
SUPER_GROUPS = {DIMENSION_NAME_MAP[dim]: [KEYWORD_NAME_MAP.get(k, k) for k in MODEL_DATA[next(iter(MODEL_DATA))][dim].keys()]
                for dim in MODEL_DATA[next(iter(MODEL_DATA))]}

SUBMISSION_NAME = "test_leaderboard_submission"
SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/cccjc/", SUBMISSION_NAME)
CSV_DIR = "./test_leaderboard_submission/results.csv"

def get_original_dimension(mapped_dimension):
    return next(k for k, v in DIMENSION_NAME_MAP.items() if v == mapped_dimension)

def get_original_keyword(mapped_keyword):
    return next((k for k, v in KEYWORD_NAME_MAP.items() if v == mapped_keyword), mapped_keyword)

# Define model groups
MODEL_GROUPS = {
    "All": list(MODEL_DATA.keys()),
    "Flagship Models": ['GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B'],
    "Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3'],
    "Proprietary Flagship Models": ['GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002'],
    "Open-source Efficiency Models": ['Qwen2_VL_7B', 'Pixtral_12B', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3'],
    "Open-source Flagship Models": ['Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B'],
    "Proprietary Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3'],
}

def get_display_model_name(model_name):
    return MODEL_NAME_MAP.get(model_name, model_name)

def get_df(selected_super_group, selected_model_group):
    original_dimension = get_original_dimension(selected_super_group)
    data = []
    for model in MODEL_GROUPS[selected_model_group]:
        model_data = MODEL_DATA[model]
        summary = SUMMARY_DATA[model]
        core_score = max(summary["core_noncot"]["macro_mean_score"], summary["core_cot"]["macro_mean_score"])
        row = {
            "Models": get_display_model_name(model),  # Use the mapped name
            "Overall": round(summary["overall_score"] * 100, 2),
            "Core": round(core_score * 100, 2),
            "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2)
        }
        for keyword in SUPER_GROUPS[selected_super_group]:
            original_keyword = get_original_keyword(keyword)
            if original_dimension in model_data and original_keyword in model_data[original_dimension]:
                row[keyword] = round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2)
            else:
                row[keyword] = None
        data.append(row)

    df = pd.DataFrame(data)
    df = df.sort_values(by="Overall", ascending=False)
    return df

def get_leaderboard_data(selected_super_group, selected_model_group):
    df = get_df(selected_super_group, selected_model_group)
    headers = ["Models", "Overall", "Core", "Open-ended"] + SUPER_GROUPS[selected_super_group]
    data = df[headers].values.tolist()
    return headers, data
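As a quick sanity check, these helpers can be exercised outside Gradio. A minimal sketch, assuming the two JSON files under `static/eval_results/` are in place:

```python
# Standalone smoke test: build the default leaderboard view exactly as app.py
# does at startup (first dimension group, "All" model group).
from utils import get_leaderboard_data, SUPER_GROUPS

default_dimension = list(SUPER_GROUPS.keys())[0]
headers, rows = get_leaderboard_data(default_dimension, "All")
print(headers)        # ["Models", "Overall", "Core", "Open-ended", *keyword columns]
for row in rows[:3]:  # rows arrive sorted by "Overall", descending
    print(row)
```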