Small changes

- app.py                     +3 -1
- contamination_report.csv   +1 -4
- dataset.py                 +14 -3
- utils.py                   +3 -0
app.py
CHANGED
@@ -38,8 +38,10 @@ def filter_dataframe(dataframe, eval_dataset, cont_source, checkboxes):
         | (dataframe["Test Split"] > 0.0)
     ]
 
+    dataframe = dataframe.sort_values("Test Split", ascending=False)
+
     return dataframe.style.format(
-        {"Train Split": "{:.1%}", "Development Split": "{:.1%}", "Test Split": "{:.1%}"}
+        {"Train Split": "{:.1%}", "Development Split": "{:.1%}", "Test Split": "{:.1%}"}, na_rep="Unknown"
     )
 
 
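The app.py change sorts the table by the "Test Split" column and renders missing split values as "Unknown" via the Styler's na_rep argument. A minimal sketch of the same pattern on a toy DataFrame (column names are taken from the diff, the values are invented; however the app actually renders the Styler is not shown here):

import pandas as pd

# Toy frame mimicking the leaderboard columns touched by this diff.
df = pd.DataFrame(
    {
        "Train Split": [0.5, None],
        "Development Split": [0.25, 1.0],
        "Test Split": [0.1, 0.9],
    }
)

# Sort rows by test-split contamination, highest first, as the new code does.
df = df.sort_values("Test Split", ascending=False)

# Format the split columns as percentages; NaN cells render as "Unknown"
# thanks to na_rep, which is the behaviour this change adds.
styler = df.style.format(
    {"Train Split": "{:.1%}", "Development Split": "{:.1%}", "Test Split": "{:.1%}"},
    na_rep="Unknown",
)
print(styler.to_html())
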
contamination_report.csv
CHANGED
@@ -1,4 +1 @@
-Evaluation Dataset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;
-conll2003;google/gemma-7b;model;1.0;1.0;1.0;model-based;https://hitz-zentroa.github.io/lm-contamination/blog/;
-conll2003;EleutherAI/the_pile_deduplicated;corpus;1.0;1.0;1.0;data-based;https://aclanthology.org/2023.findings-emnlp.722/;www.google.com
-Test;lololol;corpus;1.0;1.0;1.0;data-based;https://arxiv.org/abs/2310.03668;
+Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Reference;PR
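The report file now keeps only a header row with the new "Subset", "Reference", and "PR" columns; the sample entries were removed. A hedged sketch of loading such a semicolon-separated file (the repository's actual loading code is not part of this diff, so read_csv with sep=";" is an assumption based only on the file contents shown above):

import pandas as pd

# Assumption: the app reads the report with pandas; only the delimiter and
# the header taken from this diff are known.
data = pd.read_csv("contamination_report.csv", sep=";")

expected = [
    "Evaluation Dataset", "Subset", "Contaminated Source", "Model or corpus",
    "Train Split", "Development Split", "Test Split", "Approach", "Reference", "PR",
]
assert list(data.columns) == expected
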
dataset.py
CHANGED
@@ -207,7 +207,7 @@ def get_dataframe():
     favicon_dict = {}
 
     # Update the favicon dictionary
-    favicon_dict = update_favicon_cache([get_base_url(x) for x in data["
+    favicon_dict = update_favicon_cache([get_base_url(x) for x in data["Reference"]])
 
     # Update the model url dictionary
     model_url_dict = update_model_url_cache(
@@ -221,7 +221,7 @@ def get_dataframe():
     )
 
     # Add favicons URLs to the dataframe in a vectorized manner
-    data["
+    data["Reference"] = data["Reference"].apply(
         lambda x: build_text_icon(
             text=get_domain_name(x),
             url=x,
@@ -229,7 +229,7 @@ def get_dataframe():
         )
     )
 
-    data["PR
+    data["PR"] = data["PR"].apply(
         lambda x: build_text_icon(
             text="",
             url=x if x == x else "no link",
@@ -245,6 +245,13 @@ def get_dataframe():
         )
     )
 
+    data["Evaluation Dataset"] = data.apply(
+        lambda x: x["Evaluation Dataset"] + f" ({x['Subset']})" if pd.notna(x["Subset"]) else x["Evaluation Dataset"],
+        axis=1,
+    )
+
+    del data["Subset"]
+
     # For "Contaminated Source" use build_dataset_url if "Model or corpus" is "corpus" and build_model_url if "Model or corpus" is "model"
     data["Contaminated Source"] = data.apply(
         lambda x: build_text_icon(
@@ -257,4 +264,8 @@ def get_dataframe():
         axis=1,
     )
 
+    data["Train Split"] = data["Train Split"].apply(lambda x: x/100 if x else x)
+    data["Development Split"] = data["Development Split"].apply(lambda x: x/100 if x else x)
+    data["Test Split"] = data["Test Split"].apply(lambda x: x/100 if x else x)
+
     return data
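Taken together, the dataset.py additions fold the new "Subset" column into "Evaluation Dataset", drop it, and rescale the split columns from 0-100 percentages to 0-1 fractions. A standalone sketch of those two transformations on made-up rows (column names come from the diff, the values and dataset names are invented for illustration):

import pandas as pd

data = pd.DataFrame(
    {
        "Evaluation Dataset": ["conll2003", "mmlu"],
        "Subset": [None, "abstract_algebra"],
        "Train Split": [100.0, 12.5],
        "Development Split": [None, 50.0],
        "Test Split": [100.0, 75.0],
    }
)

# Append the subset name, when present, to the dataset name,
# e.g. "mmlu (abstract_algebra)"; rows without a subset are left as-is.
data["Evaluation Dataset"] = data.apply(
    lambda x: x["Evaluation Dataset"] + f" ({x['Subset']})"
    if pd.notna(x["Subset"])
    else x["Evaluation Dataset"],
    axis=1,
)
del data["Subset"]

# Rescale the split columns from percentages (0-100) to fractions (0-1), as
# the diff does; zero stays zero and NaN stays NaN under this lambda.
for col in ["Train Split", "Development Split", "Test Split"]:
    data[col] = data[col].apply(lambda x: x / 100 if x else x)

print(data)
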
utils.py
CHANGED
@@ -38,6 +38,9 @@ def get_domain_name(url: str) -> str:
     domain = "{uri.netloc}".format(uri=parsed_uri)
     if domain.startswith("www."):
         domain = domain[4:]
+
+    # Remove last domain
+    domain = ".".join(domain.split(".")[:-1])
     # First latter in uppercase
     return domain.capitalize()
 
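The utils.py change strips the final dot-separated label (the TLD) from the parsed host before capitalising it, so a reference such as https://aclanthology.org/... is shown as "Aclanthology" rather than "Aclanthology.org". A self-contained sketch of the resulting behaviour; the real helper lives in utils.py, and the urllib.parse import is an assumption suggested by the "{uri.netloc}" formatting in the shown context:

from urllib.parse import urlparse

def get_domain_name(url: str) -> str:
    # Mirror of the patched helper: host without "www.", without the last
    # dot-separated label, first letter capitalised.
    parsed_uri = urlparse(url)
    domain = "{uri.netloc}".format(uri=parsed_uri)
    if domain.startswith("www."):
        domain = domain[4:]

    # Remove the last domain label (e.g. ".org", ".io")
    domain = ".".join(domain.split(".")[:-1])
    # First letter in uppercase
    return domain.capitalize()

print(get_domain_name("https://aclanthology.org/2023.findings-emnlp.722/"))   # Aclanthology
print(get_domain_name("https://hitz-zentroa.github.io/lm-contamination/"))    # Hitz-zentroa.github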