zhouxiangxin1998 committed on
Commit
c78ecd5
1 Parent(s): 79c3ea8

first commit

Browse files
README.md CHANGED
@@ -9,36 +9,4 @@ pinned: true
9
  license: cc-by-nc-4.0
10
  ---
11
 
12
- # Start the configuration
13
-
14
- Most of the variables to change for a default leaderboard are in `src/envs.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
15
-
16
- Results files should have the following format and be stored as json files:
17
- ```json
18
- {
19
- "config": {
20
- "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
21
- "model_name": "path of the model on the hub: org/model",
22
- "model_sha": "revision on the hub",
23
- },
24
- "results": {
25
- "task_name": {
26
- "metric_name": score,
27
- },
28
- "task_name2": {
29
- "metric_name": score,
30
- }
31
- }
32
- }
33
- ```
34
-
35
- Request files are created automatically by this tool.
36
-
37
- If you encounter a problem on the space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
38
-
39
- # Code logic for more complex edits
40
-
41
- You'll find
42
- - the main table's column names and properties in `src/display/utils.py`
43
- - the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
44
- - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
 
9
  license: cc-by-nc-4.0
10
  ---
11
 
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,201 +1,114 @@
 
 
1
  import gradio as gr
2
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
  import pandas as pd
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
- from huggingface_hub import snapshot_download
6
 
7
  from src.about import (
8
  CITATION_BUTTON_LABEL,
9
  CITATION_BUTTON_TEXT,
10
- EVALUATION_QUEUE_TEXT,
11
- INTRODUCTION_TEXT,
12
- LLM_BENCHMARKS_TEXT,
13
- TITLE,
14
  )
15
  from src.display.css_html_js import custom_css
16
- from src.display.utils import (
17
- BENCHMARK_COLS,
18
- COLS,
19
- EVAL_COLS,
20
- EVAL_TYPES,
21
- AutoEvalColumn,
22
- ModelType,
23
- fields,
24
- WeightType,
25
- Precision
26
- )
27
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
- from src.submission.submit import add_new_eval
30
 
 
 
 
 
 
31
 
32
  def restart_space():
33
  API.restart_space(repo_id=REPO_ID)
34
 
35
- ### Space initialisation
36
- try:
37
- print(EVAL_REQUESTS_PATH)
38
- snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
40
- )
41
- except Exception:
42
- restart_space()
43
- try:
44
- print(EVAL_RESULTS_PATH)
45
- snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
- )
48
- except Exception:
49
- restart_space()
50
-
51
-
52
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
-
54
- print(LEADERBOARD_DF)
55
-
56
- (
57
- finished_eval_queue_df,
58
- running_eval_queue_df,
59
- pending_eval_queue_df,
60
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
61
-
62
- def init_leaderboard(dataframe):
63
- if dataframe is None or dataframe.empty:
64
- raise ValueError("Leaderboard DataFrame is empty or None.")
65
- return Leaderboard(
66
- value=dataframe,
67
- datatype=[c.type for c in fields(AutoEvalColumn)],
68
- select_columns=SelectColumns(
69
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
70
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
71
- label="Select Columns to Display:",
72
- ),
73
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
74
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
75
- filter_columns=[
76
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
77
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
78
- ColumnFilter(
79
- AutoEvalColumn.params.name,
80
- type="slider",
81
- min=0.01,
82
- max=150,
83
- label="Select the number of parameters (B)",
84
- ),
85
- ColumnFilter(
86
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
87
- ),
88
- ],
89
- bool_checkboxgroup_label="Hide models",
90
- interactive=False,
91
- )
92
-
93
 
94
  demo = gr.Blocks(css=custom_css)
95
  with demo:
96
- gr.HTML(TITLE)
97
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
 
 
 
 
 
98
 
99
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
100
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
101
- leaderboard = init_leaderboard(LEADERBOARD_DF)
102
-
103
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
104
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
105
-
106
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
107
- with gr.Column():
108
- with gr.Row():
109
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
110
-
111
- with gr.Column():
112
- with gr.Accordion(
113
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
114
- open=False,
115
- ):
116
- with gr.Row():
117
- finished_eval_table = gr.components.Dataframe(
118
- value=finished_eval_queue_df,
119
- headers=EVAL_COLS,
120
- datatype=EVAL_TYPES,
121
- row_count=5,
122
- )
123
- with gr.Accordion(
124
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
125
- open=False,
126
- ):
127
- with gr.Row():
128
- running_eval_table = gr.components.Dataframe(
129
- value=running_eval_queue_df,
130
- headers=EVAL_COLS,
131
- datatype=EVAL_TYPES,
132
- row_count=5,
133
- )
134
-
135
- with gr.Accordion(
136
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
137
- open=False,
138
- ):
139
- with gr.Row():
140
- pending_eval_table = gr.components.Dataframe(
141
- value=pending_eval_queue_df,
142
- headers=EVAL_COLS,
143
- datatype=EVAL_TYPES,
144
- row_count=5,
145
- )
146
  with gr.Row():
147
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
148
-
 
 
149
  with gr.Row():
150
- with gr.Column():
151
- model_name_textbox = gr.Textbox(label="Model name")
152
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
153
- model_type = gr.Dropdown(
154
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
155
- label="Model type",
156
- multiselect=False,
157
- value=None,
158
- interactive=True,
159
- )
160
-
161
- with gr.Column():
162
- precision = gr.Dropdown(
163
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
164
- label="Precision",
165
- multiselect=False,
166
- value="float16",
167
- interactive=True,
168
- )
169
- weight_type = gr.Dropdown(
170
- choices=[i.value.name for i in WeightType],
171
- label="Weights type",
172
- multiselect=False,
173
- value="Original",
174
- interactive=True,
175
- )
176
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
 
 
 
 
 
 
 
 
 
 
 
177
 
178
- submit_button = gr.Button("Submit Eval")
179
- submission_result = gr.Markdown()
180
- submit_button.click(
181
- add_new_eval,
182
- [
183
- model_name_textbox,
184
- base_model_name_textbox,
185
- revision_name_textbox,
186
- precision,
187
- weight_type,
188
- model_type,
189
- ],
190
- submission_result,
191
- )
192
 
193
  with gr.Row():
194
- with gr.Accordion("📙 Citation", open=False):
195
  citation_button = gr.Textbox(
196
  value=CITATION_BUTTON_TEXT,
197
  label=CITATION_BUTTON_LABEL,
198
- lines=20,
199
  elem_id="citation-button",
200
  show_copy_button=True,
201
  )
 
1
+ import os
2
+ import base64
3
  import gradio as gr
 
4
  import pandas as pd
5
  from apscheduler.schedulers.background import BackgroundScheduler
 
6
 
7
  from src.about import (
8
  CITATION_BUTTON_LABEL,
9
  CITATION_BUTTON_TEXT,
 
 
 
 
10
  )
11
  from src.display.css_html_js import custom_css
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
+ from src.envs import API, REPO_ID
14
+ current_dir = os.path.dirname(os.path.realpath(__file__))
15
+
16
+ with open(os.path.join(current_dir, "images/pb_logo.png"), "rb") as image_file:
17
+ main_logo = base64.b64encode(image_file.read()).decode('utf-8')
18
 
19
  def restart_space():
20
  API.restart_space(repo_id=REPO_ID)
21
 
22
+ TITLE="""
23
+ # ProteinBench: A Holistic Evaluation of Protein Foundation Models"""
24
+
25
+ INTRO_TEXT="""
26
+ Recent years have witnessed a surge in the development of protein foundation models,
27
+ significantly improving performance in protein prediction and generative tasks
28
+ ranging from 3D structure prediction and protein design to conformational dynamics.
29
+ However, the capabilities and limitations associated with these models remain poorly understood due to the absence of a unified evaluation framework.
30
+ To fill this gap, we introduce <b>ProteinBench</b>,
31
+ a holistic evaluation framework designed to enhance the transparency of protein foundation models.
32
+ Our approach consists of three key components:
33
+ (i) A taxonomic classification of tasks that broadly encompass the main challenges in the protein domain,
34
+ based on the relationships between different protein modalities;
35
+ (ii) A multi-metric evaluation approach that assesses performance across four key dimensions: quality, novelty, diversity, and robustness;
36
+ and (iii) In-depth analyses from various user objectives, providing a holistic view of model performance.
37
+ Our comprehensive evaluation of protein foundation models reveals several key findings that shed light on their current capabilities and limitations.
38
+ To promote transparency and facilitate further research, we release the evaluation dataset, code, and a public leaderboard publicly for further analysis
39
+ and a general modular toolkit. We intend for ProteinBench to be a living benchmark for establishing a standardized,
40
+ in-depth evaluation framework for protein foundation models, driving their development and application while fostering collaboration within the field.
41
+
42
+ ## [Paper](https://www.arxiv.org/pdf/2409.06744) | [Website](https://proteinbench.github.io/)
43
+ """
44
+
45
+ # ### Space initialisation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  demo = gr.Blocks(css=custom_css)
48
  with demo:
49
+ with gr.Row():
50
+ with gr.Column(scale=6):
51
+ gr.Markdown(TITLE)
52
+ with gr.Row():
53
+ with gr.Column(scale=6):
54
+ gr.Markdown(INTRO_TEXT)
55
+ with gr.Column(scale=1):
56
+ gr.HTML(f'<img src="data:image/jpeg;base64,{main_logo}" style="width:16em;vertical-align: middle"/>')
57
 
58
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
59
+ with gr.TabItem("🏆 Inverse Folding Leaderboard"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  with gr.Row():
61
+ inverse_folding_table = gr.DataFrame(
62
+ pd.read_csv('data/inverse_folding.csv')
63
+ )
64
+ with gr.TabItem("🏆 Structre Design Leaderboard"):
65
  with gr.Row():
66
+ inverse_folding_table = gr.DataFrame(
67
+ pd.read_csv('data/structure_design.csv')
68
+ )
69
+ with gr.TabItem("🏆 Sequence Design Leaderboard"):
70
+ with gr.Row():
71
+ inverse_folding_table = gr.DataFrame(
72
+ pd.read_csv('data/sequence_design.csv')
73
+ )
74
+ with gr.TabItem("🏆 Sequence-Structure Co-Design Leaderboard"):
75
+ with gr.Row():
76
+ inverse_folding_table = gr.DataFrame(
77
+ pd.read_csv('data/co_design.csv')
78
+ )
79
+ with gr.TabItem("🏆 Motif Scaffolding Leaderboard"):
80
+ with gr.Row():
81
+ inverse_folding_table = gr.DataFrame(
82
+ pd.read_csv('data/motif_scaffolding.csv')
83
+ )
84
+ with gr.TabItem("🏆 Antibody Design Leaderboard"):
85
+ with gr.Row():
86
+ inverse_folding_table = gr.DataFrame(
87
+ pd.read_csv('data/antibody_design.csv')
88
+ )
89
+ with gr.TabItem("🏅 Protein Folding Leaderboard"):
90
+ with gr.Row():
91
+ inverse_folding_table = gr.DataFrame(
92
+ pd.read_csv('data/protein_folding.csv')
93
+ )
94
+ with gr.TabItem("🏅 Multi-State Prediction Leaderboard"):
95
+ with gr.Row():
96
+ inverse_folding_table = gr.DataFrame(
97
+ pd.read_csv('data/multi_state_prediction.csv')
98
+ )
99
+ with gr.TabItem("🏅 Conformation Prediction Leaderboard"):
100
+ with gr.Row():
101
+ inverse_folding_table = gr.DataFrame(
102
+ pd.read_csv('data/conformation_prediction.csv')
103
+ )
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  with gr.Row():
107
+ with gr.Accordion("📙 Citation", open=True):
108
  citation_button = gr.Textbox(
109
  value=CITATION_BUTTON_TEXT,
110
  label=CITATION_BUTTON_LABEL,
111
+ lines=9,
112
  elem_id="citation-button",
113
  show_copy_button=True,
114
  )
data/antibody_design.csv ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ Model,AAR ↑,RMSD ↓,TM-score ↑,Binding Energy ↓,SeqSim-outer ↓,SeqSim-inner ↑,PHR ↓,CN-score ↑,Clashes-inner ↓,Clashes-outer ↓,SeqNat ↑,Total Energy ↓,scRMSD ↓
2
+ RAbD (natural),100.00%,0.00,1.00,-15.33,0.26,N/A,45.78%,50.19,0.07,0.00,-1.74,-16.76,1.77
3
+ HERN,33.17%,9.86,0.16,1242.77,0.41,N/A,39.83%,0.04,0.04,3.25,-1.47,5408.74,9.89
4
+ MEAN,33.47%,1.82,0.25,263.90,0.65,N/A,40.74%,1.33,11.65,0.29,-1.83,1077.32,2.77
5
+ dyMEAN,40.95%,2.36,0.36,889.28,0.58,N/A,42.04%,1.49,9.15,0.47,-1.79,1642.65,2.11
6
+ *dyMEAN-FixFR,40.05%,2.37,0.35,612.75,0.60,0.96,43.75%,1.14,8.88,0.48,-1.82,1239.29,2.48
7
+ *DiffAb,35.04%,2.53,0.37,489.42,0.37,0.45,40.68%,2.02,1.84,0.19,-1.88,495.69,2.57
8
+ *AbDPO,31.29%,2.79,0.35,116.06,0.38,0.60,69.69%,1.33,4.14,0.10,-1.99,270.12,2.79
9
+ *AbDPO++,36.25%,2.48,0.35,223.73,0.39,0.54,44.51%,2.34,1.66,0.08,-1.78,338.14,2.50
data/co_design.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Model,scTM (L=100) ↑,scRMSD (L=100) ↓,Max Clust. (L=100) ↑,Max TM (L=100) ↓,scTM (L=200) ↑,scRMSD (L=200) ↓,Max Clust. (L=200) ↑,Max TM (L=200) ↓,scTM (L=300) ↑,scRMSD (L=300) ↓,Max Clust. (L=300) ↑,Max TM (L=300) ↓,scTM (L=500) ↑,scRMSD (L=500) ↓,Max Clust. (L=500) ↑,Max TM (L=500) ↓
2
+ Native PDBs,0.91,2.98,0.75,N/A,0.88,3.24,0.77,N/A,0.92,3.94,0.75,N/A,0.90,9.64,0.80,N/A
3
+ ProteinGenerator,0.91,3.75,0.24,0.73,0.88,6.24,0.25,0.72,0.81,9.26,0.22,0.71,0.69,17.00,0.18,0.73
4
+ ProtPardelle*,0.56,12.90,0.57,0.66,0.64,13.67,0.10,0.69,0.69,14.91,0.04,0.72,0.44,43.15,0.60,0.69
5
+ Multiflow,0.96,1.10,0.33,0.71,0.95,1.61,0.42,0.71,0.96,2.14,0.58,0.71,0.95,2.71,0.62,0.71
6
+ ESM3*,0.72,13.80,0.64,0.41,0.63,21.18,0.63,0.61,0.59,25.50,0.52,0.73,0.64,26.72,0.46,0.78
data/conformation_prediction.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,Pairwise RMSD,*RMSF,Pearson r on Pairwise RMSD ↑,Pearson r on *Global RMSF ↑,Pearson r on *Per target RMSF ↑,*RMWD ↓,MD PCA W2 ↓,Joint PCA W2 ↓,PC sim > 0.5% ↑,Weak contacts J ↑,Transient contacts J ↑,*Exposed residue J ↑,*Exposed MI matrix ρ ↑,CA break % ↓,CA clash % ↓,PepBond break % ↓
2
+ MD iid,2.76,1.63,0.96,0.97,0.99,0.71,0.76,0.70,93.9,0.90,0.80,0.93,0.56,0.0,0.1,3.4
3
+ MD 2.5 ns,1.54,0.98,0.89,0.85,0.85,2.21,1.57,1.93,36.6,0.62,0.45,0.64,0.24,0.0,0.1,3.4
4
+ EigenFold,5.96,N/A,-0.04,N/A,N/A,N/A,2.35,7.96,12.2,0.36,0.18,N/A,N/A,0.7,9.6,N/A
5
+ MSA-depth256,0.84,0.53,0.25,0.34,0.59,3.63,1.83,2.90,29.3,0.30,0.28,0.33,0.06,0.0,0.2,5.9
6
+ MSA-depth64,2.03,1.51,0.24,0.30,0.57,4.00,1.87,3.32,18.3,0.38,0.27,0.38,0.12,0.0,0.2,8.4
7
+ MSA-depth32,5.71,7.96,0.07,0.17,0.53,6.12,2.50,5.67,17.1,0.39,0.24,0.36,0.15,0.0,0.5,13.0
8
+ Str2Str-ODE (t=0.1),1.66,N/A,0.13,N/A,N/A,N/A,2.12,4.42,6.1,0.42,0.17,N/A,N/A,0.0,0.1,13.7
9
+ Str2Str-ODE (t=0.3),3.15,N/A,0.12,N/A,N/A,N/A,2.23,4.75,9.8,0.41,0.17,N/A,N/A,0.0,0.1,14.8
10
+ Str2Str-SDE (t=0.1),4.74,N/A,0.10,N/A,N/A,N/A,2.54,8.84,9.8,0.40,0.13,N/A,N/A,1.6,0.2,23.0
11
+ Str2Str-SDE (t=0.3),7.54,N/A,0.00,N/A,N/A,N/A,3.29,12.28,7.3,0.35,0.13,N/A,N/A,1.5,0.2,21.4
12
+ AlphaFlow-PDB,2.58,1.20,0.27,0.46,0.81,2.96,1.66,2.60,37.8,0.44,0.33,0.42,0.18,0.0,0.2,6.6
13
+ AlphaFlow-MD,2.88,1.63,0.53,0.66,0.85,2.68,1.53,2.28,39.0,0.57,0.38,0.50,0.24,0.0,0.2,21.7
14
+ ESMFlow-PDB,3.00,1.68,0.14,0.27,0.71,4.20,1.77,3.54,28.0,0.42,0.29,0.41,0.16,0.0,0.6,5.4
15
+ ESMFlow-MD,3.34,2.13,0.19,0.30,0.76,3.63,1.54,3.15,25.6,0.51,0.33,0.47,0.21,0.0,0.3,10.9
16
+ ConfDiff-Open-ClsFree,3.68,2.12,0.40,0.54,0.83,2.92,1.50,2.54,46.3,0.54,0.33,0.47,0.21,0.0,1.2,5.7
17
+ ConfDiff-Open-PDB,2.90,1.43,0.38,0.51,0.82,2.97,1.57,2.51,34.1,0.47,0.34,0.43,0.18,0.0,0.9,5.7
18
+ ConfDiff-Open-MD,3.43,2.21,0.59,0.67,0.85,2.76,1.44,2.25,35.4,0.59,0.36,0.50,0.24,0.0,0.8,6.3
19
+ ConfDiff-ESM-ClsFree,4.04,2.84,0.31,0.43,0.82,3.82,1.72,3.06,37.8,0.54,0.31,0.47,0.18,0.0,1.8,4.3
20
+ ConfDiff-ESM-PDB,3.42,2.06,0.29,0.40,0.80,3.67,1.70,3.17,34.1,0.48,0.31,0.42,0.18,0.0,1.6,3.9
21
+ ConfDiff-ESM-MD,3.91,2.79,0.35,0.48,0.82,3.67,1.66,2.89,39.0,0.56,0.34,0.48,0.23,0.0,1.5,4.0
data/inverse_folding.csv CHANGED
@@ -1,4 +1,4 @@
1
- Model,CASP AAR ↑,CAMEO AAR ↑,length 100 scTM ↑,length 100 pLDDT ↑,length 200 scTM ↑,length 200 pLDDT ↑,length 300 scTM ↑,length 300 pLDDT ↑,length 400 scTM ↑,length 400 pLDDT ↑,length 500 scTM ↑,length 500 pLDDT
2
  ProteinMPNN,0.450,0.468,0.962,94.14,0.945,89.34,0.962,90.28,0.875,83.76,0.568,67.09
3
  ESM-IF1,N/A,N/A,0.810,88.83,0.635,69.67,0.336,74.36,0.449,64.59,0.462,58.97
4
  LM-Design,0.516,0.570,0.834,78.45,0.373,58.41,0.481,69.86,0.565,59.87,0.397,56.35
 
1
+ Model,CASP AAR ↑,CAMEO AAR ↑,scTM (L=100) ↑,pLDDT (L=100) ↑,scTM (L=200) ↑,pLDDT (L=200) ↑,scTM (L=300) ↑,pLDDT (L=300) ↑,scTM (L=400) ↑,pLDDT (L=400) ↑,scTM (L=500) ↑,pLDDT (L=500)
2
  ProteinMPNN,0.450,0.468,0.962,94.14,0.945,89.34,0.962,90.28,0.875,83.76,0.568,67.09
3
  ESM-IF1,N/A,N/A,0.810,88.83,0.635,69.67,0.336,74.36,0.449,64.59,0.462,58.97
4
  LM-Design,0.516,0.570,0.834,78.45,0.373,58.41,0.481,69.86,0.565,59.87,0.397,56.35
data/motif_scaffolding.csv ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Model,1QJG,2KL8,4JHW,4ZYP,5IUS,5TPN,5TRV,5WN9,6EXZ,7MRX,3IXT,1BCF,1PRW,1YCR,5YUI,6E6R
2
+ FrameFlow,15,100,10,30,80,60,25,5,55,17,20,70,10,10,5,46
3
+ RFdiffusion,17,90,13,40,65,50,37,4,57,16,30,80,12,20,8,63
4
+ TDS,25,60,15,20,85,35,34,9,42,22,25,30,15,15,20,25
5
+ EvoDiff,0,0,0,0,0,0,0,0,0,0,9,38,36,3,5,3
6
+ DPLM,0,1,0,1,0,0,0,0,1,0,37,100,81,48,94,79
7
+ ESM3,19,5,0,1,2,7,13,0,56,50,28,100,91,77,89,54
data/multi_state_prediction.csv ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,RMSDens N=10,RMSDens N=100,RMSDens N=500,RMSDens N=1000,RMSD Cluster 3 N=10,RMSD Cluster 3 N=100,RMSD Cluster 3 N=500,RMSD Cluster 3 N=1000,Pairwise RMSD,CA clash (%),CA break (%),PepBond break (%)
2
+ EigenFold,1.56,1.50,1.47,1.46,2.54,2.48,2.46,2.46,0.85,1.4,4.3,N/A
3
+ MSA-depth256,1.57,1.54,1.52,1.52,2.51,2.47,2.45,2.45,0.20,0.0,0.0,9.2
4
+ MSA-depth64,1.60,1.54,1.51,1.50,2.48,2.40,2.35,2.33,0.55,0.0,0.0,7.9
5
+ MSA-depth32,1.67,1.53,1.45,1.41,2.39,2.21,1.93,1.87,2.14,0.6,0.0,10.6
6
+ Str2Str-ODE (Tmax=0.15),2.36,2.19,2.10,2.08,3.03,2.68,2.60,2.56,1.86,0.0,0.0,13.9
7
+ Str2Str-SDE (Tmax=0.15),2.83,2.48,2.28,2.25,3.42,2.92,2.52,2.48,3.60,0.3,0.0,16.0
8
+ AlphaFlow-PDB,1.53,1.45,1.42,1.41,2.48,2.43,2.41,2.40,0.86,0.0,0.0,13.2
9
+ AlphaFlow-MD,1.74,1.51,1.45,1.43,2.44,2.32,2.28,2.24,1.26,0.0,0.1,26.2
10
+ ESMFlow-PDB,1.61,1.49,1.44,1.42,2.47,2.41,2.37,2.35,0.74,0.0,0.0,6.0
11
+ ESMFlow-MD,1.66,1.50,1.41,1.40,2.49,2.29,2.20,2.18,1.17,0.0,0.0,14.3
12
+ ConfDiff-Open-ClsFree,1.65,1.48,1.41,1.37,2.56,2.30,2.16,2.03,1.77,0.5,0.0,5.5
13
+ ConfDiff-Open-MD,1.64,1.50,1.44,1.42,2.49,2.39,2.32,2.31,1.37,0.2,0.0,4.6
14
+ ConfDiff-ESM-ClsFree,1.58,1.45,1.41,1.39,2.50,2.39,2.35,2.33,1.52,0.5,0.0,7.5
15
+ ConfDiff-ESM-MD,1.61,1.47,1.42,1.40,2.45,2.32,2.26,2.24,1.42,0.1,0.0,5.0
16
+ ConfDiff-ESM-Energy,1.63,1.47,1.43,1.42,2.55,2.43,2.41,2.40,1.26,0.1,0.0,7.5
17
+ ConfDiff-ESM-Force,1.58,1.44,1.37,1.36,2.45,2.33,2.23,2.22,1.76,0.1,0.0,8.9
data/protein_folding.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Model,TM-score ↑,RMSD ↓,GDT-TS ↑,IDDT ↑,CA clash (%) ↓,CA break (%) ↓,PepBond break (%) ↓
2
+ AlphaFold2,0.871,3.21,0.860,0.900,0.3,0.0,4.8
3
+ OpenFold,0.870,3.21,0.856,0.895,0.4,0.0,2.0
4
+ RoseTTAFold2,0.859,3.52,0.845,0.888,0.3,0.2,5.5
5
+ ESMFold,0.847,3.98,0.826,0.870,0.3,0.0,4.7
6
+ EigenFold*,0.743,7.65,0.703,0.737,8.0,0.5,N/A
data/sequence_design.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Model,ppl (L=100) ↓,pLDDT (L=100) ↑,pairwise TM (L=100) ↓,Max Clust. (L=100) ↑,Max TM (L=100) ↓,ppl (L=200) ↓,pLDDT (L=200) ↑,pairwise TM (L=200) ↓,Max Clust. (L=200) ↑,Max TM (L=200) ↓,ppl (L=300) ↓,pLDDT (L=300) ↑,pairwise TM (L=300) ↓,Max Clust. (L=300) ↑,Max TM (L=300) ↓,ppl (L=500) ↓,pLDDT (L=500) ↑,pairwise TM (L=500) ↓,Max Clust. (L=500) ↑,Max TM (L=500) ↓
2
+ Native Seqs,N/A,68.46,0.55,0.75,N/A,N/A,61.91,0.49,0.78,N/A,N/A,61.49,0.51,0.85,N/A,N/A,62.95,0.51,0.78,N/A
3
+ Progen 2 (700M),8.28,64.00,0.42,0.94,0.64,5.68,69.91,0.40,0.91,0.69,6.25,65.69,0.42,0.93,0.66,4.27,61.45,0.32,0.95,0.68
4
+ EvoDiff,16.89,50.20,0.43,0.98,0.69,17.28,50.66,0.36,1.00,0.71,17.13,45.14,0.31,1.00,0.68,16.51,43.14,0.31,1.00,0.69
5
+ DPLM (650M),6.21,85.38,0.50,0.80,0.74,4.61,93.54,0.54,0.70,0.91,3.47,93.07,0.57,0.63,0.91,3.33,87.73,0.43,0.85,0.85
6
+ ESM3 (1.4B),14.79,54.26,0.45,0.90,0.68,12.96,58.45,0.35,1.00,0.80,14.59,48.08,0.32,1.00,0.75,11.10,52.17,0.30,1.00,0.54
data/structure_design.csv ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,scTM (L=50) ↑,scRMSD (L=50) ↓,Max TM (L=50) ↓,pairwise TM (L=50) ↓,Max Clust. (L=50) ↑,scTM (L=100) ↑,scRMSD (L=100) ↓,Max TM (L=100) ↓,pairwise TM (L=100) ↓,Max Clust. (L=100) ↑,scTM (L=300) ↑,scRMSD (L=300) ↓,Max TM (L=300) ↓,pairwise TM (L=300) ↓,Max Clust. (L=300) ↑,scTM (L=500) ↑,scRMSD (L=500) ↓,Max TM (L=500) ↓,pairwise TM (L=500) ↓,Max Clust. (L=500) ↑
2
+ Native PDBs,0.91,0.74,N/A,0.29,0.66,0.96,0.67,N/A,0.30,0.77,0.97,0.82,N/A,0.28,0.77,0.97,1.07,N/A,0.29,0.80
3
+ RFdiffusion,0.95,0.45,0.65,0.58,0.67,0.98,0.48,0.76,0.41,0.32,0.96,1.03,0.64,0.36,0.65,0.79,5.60,0.62,0.33,0.89
4
+ FrameFlow,0.91,0.58,0.75,0.68,0.39,0.94,0.70,0.72,0.55,0.49,0.92,1.95,0.65,0.43,0.88,0.61,7.92,0.61,0.40,0.92
5
+ Chroma,0.85,1.05,0.59,0.29,0.48,0.89,1.27,0.70,0.35,0.59,0.87,2.47,0.66,0.36,0.67,0.72,6.71,0.60,0.29,0.99
6
+ FrameDiff(latest),0.85,1.00,0.67,0.35,0.64,0.90,1.23,0.71,0.52,0.11,0.87,2.73,0.69,0.48,0.21,0.63,9.52,0.58,0.40,0.52
7
+ FoldFlow1(sfm),0.90,0.67,0.68,0.63,0.48,0.87,1.34,0.65,0.49,0.83,0.45,9.04,0.54,0.39,1.00,0.37,13.04,0.53,0.37,1.00
8
+ FoldFlow1(base),0.79,1.19,0.66,0.53,0.76,0.81,1.70,0.62,0.48,0.83,0.43,9.56,0.54,0.39,0.98,0.35,13.20,0.52,0.39,1.00
9
+ FoldFlow1(ot),0.83,1.10,0.65,0.53,0.77,0.83,1.60,0.64,0.48,0.81,0.54,8.21,0.58,0.41,0.94,0.37,12.48,0.51,0.35,1.00
10
+ Genie,0.57,3.12,0.57,0.32,0.90,0.69,3.38,0.59,0.31,0.96,0.27,20.37,0.30,0.23,1.00,0.25,26.08,0.22,0.23,1.00
images/pb_logo.png ADDED
src/about.py CHANGED
@@ -1,72 +1,10 @@
1
- from dataclasses import dataclass
2
- from enum import Enum
3
-
4
- @dataclass
5
- class Task:
6
- benchmark: str
7
- metric: str
8
- col_name: str
9
-
10
-
11
- # Select your tasks here
12
- # ---------------------------------------------------
13
- class Tasks(Enum):
14
- # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
- task0 = Task("anli_r1", "acc", "ANLI")
16
- task1 = Task("logiqa", "acc_norm", "LogiQA")
17
-
18
- NUM_FEWSHOT = 0 # Change with your few shot
19
- # ---------------------------------------------------
20
-
21
-
22
-
23
- # Your leaderboard name
24
- TITLE = """<h1 align="center" id="space-title">ProteinBench</h1>"""
25
-
26
- # What does your leaderboard evaluate?
27
- INTRODUCTION_TEXT = """
28
- Intro text
29
- """
30
-
31
- # Which evaluations are you running? how can people reproduce what you have?
32
- LLM_BENCHMARKS_TEXT = f"""
33
- ## How it works
34
-
35
- ## Reproducibility
36
- To reproduce our results, here is the commands you can run:
37
-
38
- """
39
-
40
- EVALUATION_QUEUE_TEXT = """
41
- ## Some good practices before submitting a model
42
-
43
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
44
- ```python
45
- from transformers import AutoConfig, AutoModel, AutoTokenizer
46
- config = AutoConfig.from_pretrained("your model name", revision=revision)
47
- model = AutoModel.from_pretrained("your model name", revision=revision)
48
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
49
- ```
50
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
51
-
52
- Note: make sure your model is public!
53
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
54
-
55
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
56
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
57
-
58
- ### 3) Make sure your model has an open license!
59
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
60
-
61
- ### 4) Fill up your model card
62
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
63
-
64
- ## In case of model failure
65
- If your model is displayed in the `FAILED` category, its execution stopped.
66
- Make sure you have followed the above steps first.
67
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
68
- """
69
-
70
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
71
- CITATION_BUTTON_TEXT = r"""
72
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
2
+ CITATION_BUTTON_TEXT = r"""@misc{ye2024proteinbenchholisticevaluationprotein,
3
+ title={ProteinBench: A Holistic Evaluation of Protein Foundation Models},
4
+ author={Fei Ye and Zaixiang Zheng and Dongyu Xue and Yuning Shen and Lihao Wang and Yiming Ma and Yan Wang and Xinyou Wang and Xiangxin Zhou and Quanquan Gu},
5
+ year={2024},
6
+ eprint={2409.06744},
7
+ archivePrefix={arXiv},
8
+ primaryClass={q-bio.QM},
9
+ url={https://arxiv.org/abs/2409.06744},
10
+ }"""
src/display/utils.py DELETED
@@ -1,110 +0,0 @@
1
- from dataclasses import dataclass, make_dataclass
2
- from enum import Enum
3
-
4
- import pandas as pd
5
-
6
- from src.about import Tasks
7
-
8
- def fields(raw_class):
9
- return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
10
-
11
-
12
- # These classes are for user facing column names,
13
- # to avoid having to change them all around the code
14
- # when a modif is needed
15
- @dataclass
16
- class ColumnContent:
17
- name: str
18
- type: str
19
- displayed_by_default: bool
20
- hidden: bool = False
21
- never_hidden: bool = False
22
-
23
- ## Leaderboard columns
24
- auto_eval_column_dict = []
25
- # Init
26
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
- auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
- #Scores
29
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
- for task in Tasks:
31
- auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
- # Model information
33
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
34
- auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
35
- auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
36
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
37
- auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
38
- auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
39
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
40
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
41
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
42
-
43
- # We use make dataclass to dynamically fill the scores from Tasks
44
- AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
45
-
46
- ## For the queue columns in the submission tab
47
- @dataclass(frozen=True)
48
- class EvalQueueColumn: # Queue column
49
- model = ColumnContent("model", "markdown", True)
50
- revision = ColumnContent("revision", "str", True)
51
- private = ColumnContent("private", "bool", True)
52
- precision = ColumnContent("precision", "str", True)
53
- weight_type = ColumnContent("weight_type", "str", "Original")
54
- status = ColumnContent("status", "str", True)
55
-
56
- ## All the model information that we might need
57
- @dataclass
58
- class ModelDetails:
59
- name: str
60
- display_name: str = ""
61
- symbol: str = "" # emoji
62
-
63
-
64
- class ModelType(Enum):
65
- PT = ModelDetails(name="pretrained", symbol="🟢")
66
- FT = ModelDetails(name="fine-tuned", symbol="🔶")
67
- IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
68
- RL = ModelDetails(name="RL-tuned", symbol="🟦")
69
- Unknown = ModelDetails(name="", symbol="?")
70
-
71
- def to_str(self, separator=" "):
72
- return f"{self.value.symbol}{separator}{self.value.name}"
73
-
74
- @staticmethod
75
- def from_str(type):
76
- if "fine-tuned" in type or "🔶" in type:
77
- return ModelType.FT
78
- if "pretrained" in type or "🟢" in type:
79
- return ModelType.PT
80
- if "RL-tuned" in type or "🟦" in type:
81
- return ModelType.RL
82
- if "instruction-tuned" in type or "⭕" in type:
83
- return ModelType.IFT
84
- return ModelType.Unknown
85
-
86
- class WeightType(Enum):
87
- Adapter = ModelDetails("Adapter")
88
- Original = ModelDetails("Original")
89
- Delta = ModelDetails("Delta")
90
-
91
- class Precision(Enum):
92
- float16 = ModelDetails("float16")
93
- bfloat16 = ModelDetails("bfloat16")
94
- Unknown = ModelDetails("?")
95
-
96
- def from_str(precision):
97
- if precision in ["torch.float16", "float16"]:
98
- return Precision.float16
99
- if precision in ["torch.bfloat16", "bfloat16"]:
100
- return Precision.bfloat16
101
- return Precision.Unknown
102
-
103
- # Column selection
104
- COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
105
-
106
- EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
107
- EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
108
-
109
- BENCHMARK_COLS = [t.value.col_name for t in Tasks]
110
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/envs.py CHANGED
@@ -2,24 +2,12 @@ import os
2
 
3
  from huggingface_hub import HfApi
4
 
5
- # Info to change for your repository
6
- # ----------------------------------
7
- TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
 
9
- OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
- # ----------------------------------
11
 
12
  REPO_ID = f"{OWNER}/leaderboard"
13
  QUEUE_REPO = f"{OWNER}/requests"
14
  RESULTS_REPO = f"{OWNER}/results"
15
 
16
- # If you setup a cache later, just change HF_HOME
17
- CACHE_PATH=os.getenv("HF_HOME", ".")
18
-
19
- # Local caches
20
- EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
- EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
22
- EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
23
- EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
24
-
25
  API = HfApi(token=TOKEN)
 
2
 
3
  from huggingface_hub import HfApi
4
 
5
+ TOKEN = os.environ.get("HF_TOKEN")
 
 
6
 
7
+ OWNER = "proteinbench"
 
8
 
9
  REPO_ID = f"{OWNER}/leaderboard"
10
  QUEUE_REPO = f"{OWNER}/requests"
11
  RESULTS_REPO = f"{OWNER}/results"
12
 
 
 
 
 
 
 
 
 
 
13
  API = HfApi(token=TOKEN)
src/leaderboard/read_evals.py DELETED
@@ -1,196 +0,0 @@
1
- import glob
2
- import json
3
- import math
4
- import os
5
- from dataclasses import dataclass
6
-
7
- import dateutil
8
- import numpy as np
9
-
10
- from src.display.formatting import make_clickable_model
11
- from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
12
- from src.submission.check_validity import is_model_on_hub
13
-
14
-
15
- @dataclass
16
- class EvalResult:
17
- """Represents one full evaluation. Built from a combination of the result and request file for a given run.
18
- """
19
- eval_name: str # org_model_precision (uid)
20
- full_model: str # org/model (path on hub)
21
- org: str
22
- model: str
23
- revision: str # commit hash, "" if main
24
- results: dict
25
- precision: Precision = Precision.Unknown
26
- model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
27
- weight_type: WeightType = WeightType.Original # Original or Adapter
28
- architecture: str = "Unknown"
29
- license: str = "?"
30
- likes: int = 0
31
- num_params: int = 0
32
- date: str = "" # submission date of request file
33
- still_on_hub: bool = False
34
-
35
- @classmethod
36
- def init_from_json_file(self, json_filepath):
37
- """Inits the result from the specific model result file"""
38
- with open(json_filepath) as fp:
39
- data = json.load(fp)
40
-
41
- config = data.get("config")
42
-
43
- # Precision
44
- precision = Precision.from_str(config.get("model_dtype"))
45
-
46
- # Get model and org
47
- org_and_model = config.get("model_name", config.get("model_args", None))
48
- org_and_model = org_and_model.split("/", 1)
49
-
50
- if len(org_and_model) == 1:
51
- org = None
52
- model = org_and_model[0]
53
- result_key = f"{model}_{precision.value.name}"
54
- else:
55
- org = org_and_model[0]
56
- model = org_and_model[1]
57
- result_key = f"{org}_{model}_{precision.value.name}"
58
- full_model = "/".join(org_and_model)
59
-
60
- still_on_hub, _, model_config = is_model_on_hub(
61
- full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
62
- )
63
- architecture = "?"
64
- if model_config is not None:
65
- architectures = getattr(model_config, "architectures", None)
66
- if architectures:
67
- architecture = ";".join(architectures)
68
-
69
- # Extract results available in this file (some results are split in several files)
70
- results = {}
71
- for task in Tasks:
72
- task = task.value
73
-
74
- # We average all scores of a given metric (not all metrics are present in all files)
75
- accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
76
- if accs.size == 0 or any([acc is None for acc in accs]):
77
- continue
78
-
79
- mean_acc = np.mean(accs) * 100.0
80
- results[task.benchmark] = mean_acc
81
-
82
- return self(
83
- eval_name=result_key,
84
- full_model=full_model,
85
- org=org,
86
- model=model,
87
- results=results,
88
- precision=precision,
89
- revision= config.get("model_sha", ""),
90
- still_on_hub=still_on_hub,
91
- architecture=architecture
92
- )
93
-
94
- def update_with_request_file(self, requests_path):
95
- """Finds the relevant request file for the current model and updates info with it"""
96
- request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
97
-
98
- try:
99
- with open(request_file, "r") as f:
100
- request = json.load(f)
101
- self.model_type = ModelType.from_str(request.get("model_type", ""))
102
- self.weight_type = WeightType[request.get("weight_type", "Original")]
103
- self.license = request.get("license", "?")
104
- self.likes = request.get("likes", 0)
105
- self.num_params = request.get("params", 0)
106
- self.date = request.get("submitted_time", "")
107
- except Exception:
108
- print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
109
-
110
- def to_dict(self):
111
- """Converts the Eval Result to a dict compatible with our dataframe display"""
112
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
113
- data_dict = {
114
- "eval_name": self.eval_name, # not a column, just a save name,
115
- AutoEvalColumn.precision.name: self.precision.value.name,
116
- AutoEvalColumn.model_type.name: self.model_type.value.name,
117
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
118
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
119
- AutoEvalColumn.architecture.name: self.architecture,
120
- AutoEvalColumn.model.name: make_clickable_model(self.full_model),
121
- AutoEvalColumn.revision.name: self.revision,
122
- AutoEvalColumn.average.name: average,
123
- AutoEvalColumn.license.name: self.license,
124
- AutoEvalColumn.likes.name: self.likes,
125
- AutoEvalColumn.params.name: self.num_params,
126
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
127
- }
128
-
129
- for task in Tasks:
130
- data_dict[task.value.col_name] = self.results[task.value.benchmark]
131
-
132
- return data_dict
133
-
134
-
135
- def get_request_file_for_model(requests_path, model_name, precision):
136
- """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
137
- request_files = os.path.join(
138
- requests_path,
139
- f"{model_name}_eval_request_*.json",
140
- )
141
- request_files = glob.glob(request_files)
142
-
143
- # Select correct request file (precision)
144
- request_file = ""
145
- request_files = sorted(request_files, reverse=True)
146
- for tmp_request_file in request_files:
147
- with open(tmp_request_file, "r") as f:
148
- req_content = json.load(f)
149
- if (
150
- req_content["status"] in ["FINISHED"]
151
- and req_content["precision"] == precision.split(".")[-1]
152
- ):
153
- request_file = tmp_request_file
154
- return request_file
155
-
156
-
157
- def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
158
- """From the path of the results folder root, extract all needed info for results"""
159
- model_result_filepaths = []
160
-
161
- for root, _, files in os.walk(results_path):
162
- # We should only have json files in model results
163
- if len(files) == 0 or any([not f.endswith(".json") for f in files]):
164
- continue
165
-
166
- # Sort the files by date
167
- try:
168
- files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
169
- except dateutil.parser._parser.ParserError:
170
- files = [files[-1]]
171
-
172
- for file in files:
173
- model_result_filepaths.append(os.path.join(root, file))
174
-
175
- eval_results = {}
176
- for model_result_filepath in model_result_filepaths:
177
- # Creation of result
178
- eval_result = EvalResult.init_from_json_file(model_result_filepath)
179
- eval_result.update_with_request_file(requests_path)
180
-
181
- # Store results of same eval together
182
- eval_name = eval_result.eval_name
183
- if eval_name in eval_results.keys():
184
- eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
185
- else:
186
- eval_results[eval_name] = eval_result
187
-
188
- results = []
189
- for v in eval_results.values():
190
- try:
191
- v.to_dict() # we test if the dict version is complete
192
- results.append(v)
193
- except KeyError: # not all eval values present
194
- continue
195
-
196
- return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/populate.py DELETED
@@ -1,58 +0,0 @@
1
- import json
2
- import os
3
-
4
- import pandas as pd
5
-
6
- from src.display.formatting import has_no_nan_values, make_clickable_model
7
- from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
- from src.leaderboard.read_evals import get_raw_eval_results
9
-
10
-
11
- def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
- """Creates a dataframe from all the individual experiment results"""
13
- raw_data = get_raw_eval_results(results_path, requests_path)
14
- all_data_json = [v.to_dict() for v in raw_data]
15
-
16
- df = pd.DataFrame.from_records(all_data_json)
17
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
18
- df = df[cols].round(decimals=2)
19
-
20
- # filter out if any of the benchmarks have not been produced
21
- df = df[has_no_nan_values(df, benchmark_cols)]
22
- return df
23
-
24
-
25
- def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
26
- """Creates the different dataframes for the evaluation queues requestes"""
27
- entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
28
- all_evals = []
29
-
30
- for entry in entries:
31
- if ".json" in entry:
32
- file_path = os.path.join(save_path, entry)
33
- with open(file_path) as fp:
34
- data = json.load(fp)
35
-
36
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
37
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
38
-
39
- all_evals.append(data)
40
- elif ".md" not in entry:
41
- # this is a folder
42
- sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
43
- for sub_entry in sub_entries:
44
- file_path = os.path.join(save_path, entry, sub_entry)
45
- with open(file_path) as fp:
46
- data = json.load(fp)
47
-
48
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
49
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
50
- all_evals.append(data)
51
-
52
- pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
53
- running_list = [e for e in all_evals if e["status"] == "RUNNING"]
54
- finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
55
- df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
56
- df_running = pd.DataFrame.from_records(running_list, columns=cols)
57
- df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
58
- return df_finished[cols], df_running[cols], df_pending[cols]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/submission/check_validity.py DELETED
@@ -1,99 +0,0 @@
1
- import json
2
- import os
3
- import re
4
- from collections import defaultdict
5
- from datetime import datetime, timedelta, timezone
6
-
7
- import huggingface_hub
8
- from huggingface_hub import ModelCard
9
- from huggingface_hub.hf_api import ModelInfo
10
- from transformers import AutoConfig
11
- from transformers.models.auto.tokenization_auto import AutoTokenizer
12
-
13
- def check_model_card(repo_id: str) -> tuple[bool, str]:
14
- """Checks if the model card and license exist and have been filled"""
15
- try:
16
- card = ModelCard.load(repo_id)
17
- except huggingface_hub.utils.EntryNotFoundError:
18
- return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
19
-
20
- # Enforce license metadata
21
- if card.data.license is None:
22
- if not ("license_name" in card.data and "license_link" in card.data):
23
- return False, (
24
- "License not found. Please add a license to your model card using the `license` metadata or a"
25
- " `license_name`/`license_link` pair."
26
- )
27
-
28
- # Enforce card content
29
- if len(card.text) < 200:
30
- return False, "Please add a description to your model card, it is too short."
31
-
32
- return True, ""
33
-
34
- def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
35
- """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
36
- try:
37
- config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
38
- if test_tokenizer:
39
- try:
40
- tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
41
- except ValueError as e:
42
- return (
43
- False,
44
- f"uses a tokenizer which is not in a transformers release: {e}",
45
- None
46
- )
47
- except Exception as e:
48
- return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
49
- return True, None, config
50
-
51
- except ValueError:
52
- return (
53
- False,
54
- "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
55
- None
56
- )
57
-
58
- except Exception as e:
59
- return False, "was not found on hub!", None
60
-
61
-
62
- def get_model_size(model_info: ModelInfo, precision: str):
63
- """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
64
- try:
65
- model_size = round(model_info.safetensors["total"] / 1e9, 3)
66
- except (AttributeError, TypeError):
67
- return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
68
-
69
- size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
70
- model_size = size_factor * model_size
71
- return model_size
72
-
73
- def get_model_arch(model_info: ModelInfo):
74
- """Gets the model architecture from the configuration"""
75
- return model_info.config.get("architectures", "Unknown")
76
-
77
- def already_submitted_models(requested_models_dir: str) -> set[str]:
78
- """Gather a list of already submitted models to avoid duplicates"""
79
- depth = 1
80
- file_names = []
81
- users_to_submission_dates = defaultdict(list)
82
-
83
- for root, _, files in os.walk(requested_models_dir):
84
- current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
85
- if current_depth == depth:
86
- for file in files:
87
- if not file.endswith(".json"):
88
- continue
89
- with open(os.path.join(root, file), "r") as f:
90
- info = json.load(f)
91
- file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
92
-
93
- # Select organisation
94
- if info["model"].count("/") == 0 or "submitted_time" not in info:
95
- continue
96
- organisation, _ = info["model"].split("/")
97
- users_to_submission_dates[organisation].append(info["submitted_time"])
98
-
99
- return set(file_names), users_to_submission_dates
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/submission/submit.py DELETED
@@ -1,119 +0,0 @@
1
- import json
2
- import os
3
- from datetime import datetime, timezone
4
-
5
- from src.display.formatting import styled_error, styled_message, styled_warning
6
- from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
- from src.submission.check_validity import (
8
- already_submitted_models,
9
- check_model_card,
10
- get_model_size,
11
- is_model_on_hub,
12
- )
13
-
14
- REQUESTED_MODELS = None
15
- USERS_TO_SUBMISSION_DATES = None
16
-
17
- def add_new_eval(
18
- model: str,
19
- base_model: str,
20
- revision: str,
21
- precision: str,
22
- weight_type: str,
23
- model_type: str,
24
- ):
25
- global REQUESTED_MODELS
26
- global USERS_TO_SUBMISSION_DATES
27
- if not REQUESTED_MODELS:
28
- REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
29
-
30
- user_name = ""
31
- model_path = model
32
- if "/" in model:
33
- user_name = model.split("/")[0]
34
- model_path = model.split("/")[1]
35
-
36
- precision = precision.split(" ")[0]
37
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
38
-
39
- if model_type is None or model_type == "":
40
- return styled_error("Please select a model type.")
41
-
42
- # Does the model actually exist?
43
- if revision == "":
44
- revision = "main"
45
-
46
- # Is the model on the hub?
47
- if weight_type in ["Delta", "Adapter"]:
48
- base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
49
- if not base_model_on_hub:
50
- return styled_error(f'Base model "{base_model}" {error}')
51
-
52
- if not weight_type == "Adapter":
53
- model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
54
- if not model_on_hub:
55
- return styled_error(f'Model "{model}" {error}')
56
-
57
- # Is the model info correctly filled?
58
- try:
59
- model_info = API.model_info(repo_id=model, revision=revision)
60
- except Exception:
61
- return styled_error("Could not get your model information. Please fill it up properly.")
62
-
63
- model_size = get_model_size(model_info=model_info, precision=precision)
64
-
65
- # Were the model card and license filled?
66
- try:
67
- license = model_info.cardData["license"]
68
- except Exception:
69
- return styled_error("Please select a license for your model")
70
-
71
- modelcard_OK, error_msg = check_model_card(model)
72
- if not modelcard_OK:
73
- return styled_error(error_msg)
74
-
75
- # Seems good, creating the eval
76
- print("Adding new eval")
77
-
78
- eval_entry = {
79
- "model": model,
80
- "base_model": base_model,
81
- "revision": revision,
82
- "precision": precision,
83
- "weight_type": weight_type,
84
- "status": "PENDING",
85
- "submitted_time": current_time,
86
- "model_type": model_type,
87
- "likes": model_info.likes,
88
- "params": model_size,
89
- "license": license,
90
- "private": False,
91
- }
92
-
93
- # Check for duplicate submission
94
- if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
95
- return styled_warning("This model has been already submitted.")
96
-
97
- print("Creating eval file")
98
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
99
- os.makedirs(OUT_DIR, exist_ok=True)
100
- out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
101
-
102
- with open(out_path, "w") as f:
103
- f.write(json.dumps(eval_entry))
104
-
105
- print("Uploading eval file")
106
- API.upload_file(
107
- path_or_fileobj=out_path,
108
- path_in_repo=out_path.split("eval-queue/")[1],
109
- repo_id=QUEUE_REPO,
110
- repo_type="dataset",
111
- commit_message=f"Add {model} to eval queue",
112
- )
113
-
114
- # Remove the local file
115
- os.remove(out_path)
116
-
117
- return styled_message(
118
- "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
119
- )