Spaces:
Runtime error
Runtime error
Muennighoff
committed on
Commit
•
817663f
1
Parent(s):
96fcd80
Add Polish Overall
Browse files
app.py
CHANGED
@@ -57,6 +57,16 @@ TASK_LIST_CLASSIFICATION_NB = [
|
|
57 |
"ScalaNbClassification",
|
58 |
]
|
59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
TASK_LIST_CLASSIFICATION_SV = [
|
61 |
"DalajClassification",
|
62 |
"MassiveIntentClassification (sv)",
|
@@ -102,6 +112,10 @@ TASK_LIST_CLUSTERING_DE = [
|
|
102 |
"TenKGnadClusteringS2S",
|
103 |
]
|
104 |
|
|
|
|
|
|
|
|
|
105 |
TASK_LIST_CLUSTERING_ZH = [
|
106 |
"CLSClusteringP2P",
|
107 |
"CLSClusteringS2S",
|
@@ -115,6 +129,13 @@ TASK_LIST_PAIR_CLASSIFICATION = [
|
|
115 |
"TwitterURLCorpus",
|
116 |
]
|
117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
TASK_LIST_PAIR_CLASSIFICATION_ZH = [
|
119 |
"Cmnli",
|
120 |
"Ocnli",
|
@@ -205,6 +226,12 @@ TASK_LIST_STS = [
|
|
205 |
"STSBenchmark",
|
206 |
]
|
207 |
|
|
|
|
|
|
|
|
|
|
|
|
|
208 |
TASK_LIST_STS_ZH = [
|
209 |
"AFQMC",
|
210 |
"ATEC",
|
@@ -222,6 +249,7 @@ TASK_LIST_STS_NORM = [x.replace(" (en)", "").replace(" (en-en)", "") for x in TA
|
|
222 |
TASK_LIST_SUMMARIZATION = ["SummEval",]
|
223 |
|
224 |
TASK_LIST_EN = TASK_LIST_CLASSIFICATION + TASK_LIST_CLUSTERING + TASK_LIST_PAIR_CLASSIFICATION + TASK_LIST_RERANKING + TASK_LIST_RETRIEVAL + TASK_LIST_STS + TASK_LIST_SUMMARIZATION
|
|
|
225 |
TASK_LIST_ZH = TASK_LIST_CLASSIFICATION_ZH + TASK_LIST_CLUSTERING_ZH + TASK_LIST_PAIR_CLASSIFICATION_ZH + TASK_LIST_RERANKING_ZH + TASK_LIST_RETRIEVAL_ZH + TASK_LIST_STS_ZH
|
226 |
|
227 |
TASK_TO_METRIC = {
|
@@ -298,6 +326,8 @@ EXTERNAL_MODELS = [
|
|
298 |
"sentence-t5-xl",
|
299 |
"sentence-t5-xxl",
|
300 |
"sup-simcse-bert-base-uncased",
|
|
|
|
|
301 |
"text2vec-base-chinese",
|
302 |
"text2vec-large-chinese",
|
303 |
"text-embedding-ada-002",
|
@@ -371,6 +401,8 @@ EXTERNAL_MODEL_TO_LINK = {
|
|
371 |
"sentence-t5-xl": "https://huggingface.co/sentence-transformers/sentence-t5-xl",
|
372 |
"sentence-t5-xxl": "https://huggingface.co/sentence-transformers/sentence-t5-xxl",
|
373 |
"sup-simcse-bert-base-uncased": "https://huggingface.co/princeton-nlp/sup-simcse-bert-base-uncased",
|
|
|
|
|
374 |
"text2vec-base-chinese": "https://huggingface.co/shibing624/text2vec-base-chinese",
|
375 |
"text2vec-large-chinese": "https://huggingface.co/GanymedeNil/text2vec-large-chinese",
|
376 |
"text-embedding-ada-002": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
|
@@ -444,6 +476,8 @@ EXTERNAL_MODEL_TO_DIM = {
|
|
444 |
"sentence-t5-xl": 768,
|
445 |
"sentence-t5-xxl": 768,
|
446 |
"sup-simcse-bert-base-uncased": 768,
|
|
|
|
|
447 |
"text2vec-base-chinese": 768,
|
448 |
"text2vec-large-chinese": 1024,
|
449 |
"text-embedding-ada-002": 1536,
|
@@ -517,6 +551,8 @@ EXTERNAL_MODEL_TO_SEQLEN = {
|
|
517 |
"sentence-t5-xl": 512,
|
518 |
"sentence-t5-xxl": 512,
|
519 |
"sup-simcse-bert-base-uncased": 512,
|
|
|
|
|
520 |
"text2vec-base-chinese": 512,
|
521 |
"text2vec-large-chinese": 512,
|
522 |
"text-embedding-ada-002": 8191,
|
@@ -590,6 +626,8 @@ EXTERNAL_MODEL_TO_SIZE = {
|
|
590 |
"sentence-t5-xl": 2.48,
|
591 |
"sentence-t5-xxl": 9.73,
|
592 |
"sup-simcse-bert-base-uncased": 0.44,
|
|
|
|
|
593 |
"text2vec-base-chinese": 0.41,
|
594 |
"text2vec-large-chinese": 1.30,
|
595 |
"unsup-simcse-bert-base-uncased": 0.44,
|
@@ -621,6 +659,7 @@ MODELS_TO_SKIP = {
|
|
621 |
"dmlls/all-mpnet-base-v2",
|
622 |
"cgldo/semanticClone",
|
623 |
"Malmuk1/e5-large-v2_Sharded",
|
|
|
624 |
}
|
625 |
|
626 |
EXTERNAL_MODEL_RESULTS = {model: {k: {v: []} for k, v in TASK_TO_METRIC.items()} for model in EXTERNAL_MODELS}
|
@@ -634,17 +673,17 @@ def add_lang(examples):
|
|
634 |
|
635 |
def add_task(examples):
|
636 |
# Could be added to the dataset loading script instead
|
637 |
-
if examples["mteb_dataset_name"] in TASK_LIST_CLASSIFICATION_NORM + TASK_LIST_CLASSIFICATION_DA +
|
638 |
examples["mteb_task"] = "Classification"
|
639 |
-
elif examples["mteb_dataset_name"] in TASK_LIST_CLUSTERING + TASK_LIST_CLUSTERING_DE + TASK_LIST_CLUSTERING_ZH:
|
640 |
examples["mteb_task"] = "Clustering"
|
641 |
-
elif examples["mteb_dataset_name"] in TASK_LIST_PAIR_CLASSIFICATION + TASK_LIST_PAIR_CLASSIFICATION_ZH:
|
642 |
examples["mteb_task"] = "PairClassification"
|
643 |
elif examples["mteb_dataset_name"] in TASK_LIST_RERANKING + TASK_LIST_RERANKING_ZH:
|
644 |
examples["mteb_task"] = "Reranking"
|
645 |
elif examples["mteb_dataset_name"] in TASK_LIST_RETRIEVAL_NORM + TASK_LIST_RETRIEVAL_PL + TASK_LIST_RETRIEVAL_ZH:
|
646 |
examples["mteb_task"] = "Retrieval"
|
647 |
-
elif examples["mteb_dataset_name"] in TASK_LIST_STS_NORM + TASK_LIST_STS_ZH:
|
648 |
examples["mteb_task"] = "STS"
|
649 |
elif examples["mteb_dataset_name"] in TASK_LIST_SUMMARIZATION:
|
650 |
examples["mteb_task"] = "Summarization"
|
@@ -915,7 +954,62 @@ def get_mteb_average_zh():
|
|
915 |
|
916 |
return DATA_OVERALL_ZH
|
917 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
918 |
get_mteb_average()
|
|
|
919 |
get_mteb_average_zh()
|
920 |
DATA_BITEXT_MINING = get_mteb_data(["BitextMining"], [], TASK_LIST_BITEXT_MINING)
|
921 |
DATA_BITEXT_MINING_OTHER = get_mteb_data(["BitextMining"], [], TASK_LIST_BITEXT_MINING_OTHER)
|
@@ -924,7 +1018,6 @@ DATA_CLASSIFICATION_NB = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIF
|
|
924 |
DATA_CLASSIFICATION_SV = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_SV)
|
925 |
DATA_CLASSIFICATION_OTHER = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_OTHER)
|
926 |
DATA_CLUSTERING_DE = get_mteb_data(["Clustering"], [], TASK_LIST_CLUSTERING_DE)
|
927 |
-
DATA_RETRIEVAL_PL = get_mteb_data(["Retrieval"], [], TASK_LIST_RETRIEVAL_PL)
|
928 |
DATA_STS_OTHER = get_mteb_data(["STS"], [], TASK_LIST_STS_OTHER)
|
929 |
|
930 |
# Exact, add all non-nan integer values for every dataset
|
@@ -938,19 +1031,24 @@ for d in [
|
|
938 |
DATA_CLASSIFICATION_EN,
|
939 |
DATA_CLASSIFICATION_DA,
|
940 |
DATA_CLASSIFICATION_NB,
|
|
|
941 |
DATA_CLASSIFICATION_SV,
|
942 |
DATA_CLASSIFICATION_ZH,
|
943 |
DATA_CLASSIFICATION_OTHER,
|
944 |
DATA_CLUSTERING,
|
945 |
DATA_CLUSTERING_DE,
|
|
|
946 |
DATA_CLUSTERING_ZH,
|
947 |
DATA_PAIR_CLASSIFICATION,
|
|
|
948 |
DATA_PAIR_CLASSIFICATION_ZH,
|
949 |
DATA_RERANKING,
|
950 |
DATA_RERANKING_ZH,
|
951 |
DATA_RETRIEVAL,
|
|
|
952 |
DATA_RETRIEVAL_ZH,
|
953 |
DATA_STS_EN,
|
|
|
954 |
DATA_STS_ZH,
|
955 |
DATA_STS_OTHER,
|
956 |
DATA_SUMMARIZATION,
|
@@ -1017,6 +1115,25 @@ with block:
|
|
1017 |
with gr.Row():
|
1018 |
data_run_overall_zh = gr.Button("Refresh")
|
1019 |
data_run_overall_zh.click(get_mteb_average_zh, inputs=None, outputs=data_overall_zh)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1020 |
with gr.TabItem("Bitext Mining"):
|
1021 |
with gr.TabItem("English-X"):
|
1022 |
with gr.Row():
|
@@ -1184,7 +1301,36 @@ with block:
|
|
1184 |
datasets_classification_nb,
|
1185 |
],
|
1186 |
outputs=data_classification_nb,
|
1187 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1188 |
with gr.TabItem("Swedish"):
|
1189 |
with gr.Row():
|
1190 |
gr.Markdown("""
|
@@ -1316,7 +1462,32 @@ with block:
|
|
1316 |
get_mteb_data,
|
1317 |
inputs=[task_clustering_de, lang_clustering_de, datasets_clustering_de],
|
1318 |
outputs=data_clustering_de,
|
1319 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1320 |
with gr.TabItem("Pair Classification"):
|
1321 |
with gr.TabItem("English"):
|
1322 |
with gr.Row():
|
@@ -1375,6 +1546,35 @@ with block:
|
|
1375 |
],
|
1376 |
outputs=data_pair_classification_zh,
|
1377 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1378 |
with gr.TabItem("Reranking"):
|
1379 |
with gr.TabItem("English"):
|
1380 |
with gr.Row():
|
@@ -1561,6 +1761,31 @@ with block:
|
|
1561 |
inputs=[task_sts_zh, lang_sts_zh, datasets_sts_zh],
|
1562 |
outputs=data_sts_zh,
|
1563 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1564 |
with gr.TabItem("Other"):
|
1565 |
with gr.Row():
|
1566 |
gr.Markdown("""
|
@@ -1627,16 +1852,6 @@ with block:
|
|
1627 |
# This is optional - If deactivated the data loaded at "Build time" is shown like for Overall tab
|
1628 |
"""
|
1629 |
block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
|
1630 |
-
block.load(get_mteb_data, inputs=[task_classification_en, lang_classification_en], outputs=data_classification_en)
|
1631 |
-
block.load(get_mteb_data, inputs=[task_classification], outputs=data_classification)
|
1632 |
-
block.load(get_mteb_data, inputs=[task_clustering, empty, datasets_clustering], outputs=data_clustering)
|
1633 |
-
block.load(get_mteb_data, inputs=[task_clustering_de, empty_de, datasets_clustering_de], outputs=data_clustering_de)
|
1634 |
-
block.load(get_mteb_data, inputs=[task_pair_classification], outputs=data_pair_classification)
|
1635 |
-
block.load(get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval)
|
1636 |
-
block.load(get_mteb_data, inputs=[task_reranking], outputs=data_reranking)
|
1637 |
-
block.load(get_mteb_data, inputs=[task_sts_en, lang_sts_en], outputs=data_sts_en)
|
1638 |
-
block.load(get_mteb_data, inputs=[task_sts], outputs=data_sts)
|
1639 |
-
block.load(get_mteb_data, inputs=[task_summarization], outputs=data_summarization)
|
1640 |
"""
|
1641 |
|
1642 |
block.queue(concurrency_count=40, max_size=10)
|
|
|
57 |
"ScalaNbClassification",
|
58 |
]
|
59 |
|
60 |
+
TASK_LIST_CLASSIFICATION_PL = [
|
61 |
+
"AbusiveClauses",
|
62 |
+
"AllegroReviews",
|
63 |
+
"CBD",
|
64 |
+
"MassiveIntentClassification (pl)",
|
65 |
+
"MassiveScenarioClassification (pl)",
|
66 |
+
"PolEmo2.0-IN",
|
67 |
+
"PolEmo2.0-OUT",
|
68 |
+
]
|
69 |
+
|
70 |
TASK_LIST_CLASSIFICATION_SV = [
|
71 |
"DalajClassification",
|
72 |
"MassiveIntentClassification (sv)",
|
|
|
112 |
"TenKGnadClusteringS2S",
|
113 |
]
|
114 |
|
115 |
+
TASK_LIST_CLUSTERING_PL = [
|
116 |
+
"8TagsClustering",
|
117 |
+
]
|
118 |
+
|
119 |
TASK_LIST_CLUSTERING_ZH = [
|
120 |
"CLSClusteringP2P",
|
121 |
"CLSClusteringS2S",
|
|
|
129 |
"TwitterURLCorpus",
|
130 |
]
|
131 |
|
132 |
+
TASK_LIST_PAIR_CLASSIFICATION_PL = [
|
133 |
+
"CDSC-E",
|
134 |
+
"PPC",
|
135 |
+
"PSC",
|
136 |
+
"SICK-E-PL",
|
137 |
+
]
|
138 |
+
|
139 |
TASK_LIST_PAIR_CLASSIFICATION_ZH = [
|
140 |
"Cmnli",
|
141 |
"Ocnli",
|
|
|
226 |
"STSBenchmark",
|
227 |
]
|
228 |
|
229 |
+
TASK_LIST_STS_PL = [
|
230 |
+
"CDSC-R",
|
231 |
+
"SICK-R-PL",
|
232 |
+
"STS22 (pl)",
|
233 |
+
]
|
234 |
+
|
235 |
TASK_LIST_STS_ZH = [
|
236 |
"AFQMC",
|
237 |
"ATEC",
|
|
|
249 |
TASK_LIST_SUMMARIZATION = ["SummEval",]
|
250 |
|
251 |
TASK_LIST_EN = TASK_LIST_CLASSIFICATION + TASK_LIST_CLUSTERING + TASK_LIST_PAIR_CLASSIFICATION + TASK_LIST_RERANKING + TASK_LIST_RETRIEVAL + TASK_LIST_STS + TASK_LIST_SUMMARIZATION
|
252 |
+
TASK_LIST_PL = TASK_LIST_CLASSIFICATION_PL + TASK_LIST_CLUSTERING_PL + TASK_LIST_PAIR_CLASSIFICATION_PL + TASK_LIST_RETRIEVAL_PL + TASK_LIST_STS_PL
|
253 |
TASK_LIST_ZH = TASK_LIST_CLASSIFICATION_ZH + TASK_LIST_CLUSTERING_ZH + TASK_LIST_PAIR_CLASSIFICATION_ZH + TASK_LIST_RERANKING_ZH + TASK_LIST_RETRIEVAL_ZH + TASK_LIST_STS_ZH
|
254 |
|
255 |
TASK_TO_METRIC = {
|
|
|
326 |
"sentence-t5-xl",
|
327 |
"sentence-t5-xxl",
|
328 |
"sup-simcse-bert-base-uncased",
|
329 |
+
"st-polish-paraphrase-from-distilroberta",
|
330 |
+
"st-polish-paraphrase-from-mpnet",
|
331 |
"text2vec-base-chinese",
|
332 |
"text2vec-large-chinese",
|
333 |
"text-embedding-ada-002",
|
|
|
401 |
"sentence-t5-xl": "https://huggingface.co/sentence-transformers/sentence-t5-xl",
|
402 |
"sentence-t5-xxl": "https://huggingface.co/sentence-transformers/sentence-t5-xxl",
|
403 |
"sup-simcse-bert-base-uncased": "https://huggingface.co/princeton-nlp/sup-simcse-bert-base-uncased",
|
404 |
+
"st-polish-paraphrase-from-distilroberta": "https://huggingface.co/sdadas/st-polish-paraphrase-from-distilroberta",
|
405 |
+
"st-polish-paraphrase-from-mpnet": "https://huggingface.co/sdadas/st-polish-paraphrase-from-mpnet",
|
406 |
"text2vec-base-chinese": "https://huggingface.co/shibing624/text2vec-base-chinese",
|
407 |
"text2vec-large-chinese": "https://huggingface.co/GanymedeNil/text2vec-large-chinese",
|
408 |
"text-embedding-ada-002": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
|
|
|
476 |
"sentence-t5-xl": 768,
|
477 |
"sentence-t5-xxl": 768,
|
478 |
"sup-simcse-bert-base-uncased": 768,
|
479 |
+
"st-polish-paraphrase-from-distilroberta": 768,
|
480 |
+
"st-polish-paraphrase-from-mpnet": 768,
|
481 |
"text2vec-base-chinese": 768,
|
482 |
"text2vec-large-chinese": 1024,
|
483 |
"text-embedding-ada-002": 1536,
|
|
|
551 |
"sentence-t5-xl": 512,
|
552 |
"sentence-t5-xxl": 512,
|
553 |
"sup-simcse-bert-base-uncased": 512,
|
554 |
+
"st-polish-paraphrase-from-distilroberta": 514,
|
555 |
+
"st-polish-paraphrase-from-mpnet": 514,
|
556 |
"text2vec-base-chinese": 512,
|
557 |
"text2vec-large-chinese": 512,
|
558 |
"text-embedding-ada-002": 8191,
|
|
|
626 |
"sentence-t5-xl": 2.48,
|
627 |
"sentence-t5-xxl": 9.73,
|
628 |
"sup-simcse-bert-base-uncased": 0.44,
|
629 |
+
"st-polish-paraphrase-from-distilroberta": 0.50,
|
630 |
+
"st-polish-paraphrase-from-mpnet": 0.50,
|
631 |
"text2vec-base-chinese": 0.41,
|
632 |
"text2vec-large-chinese": 1.30,
|
633 |
"unsup-simcse-bert-base-uncased": 0.44,
|
|
|
659 |
"dmlls/all-mpnet-base-v2",
|
660 |
"cgldo/semanticClone",
|
661 |
"Malmuk1/e5-large-v2_Sharded",
|
662 |
+
"jncraton/gte-small-ct2-int8",
|
663 |
}
|
664 |
|
665 |
EXTERNAL_MODEL_RESULTS = {model: {k: {v: []} for k, v in TASK_TO_METRIC.items()} for model in EXTERNAL_MODELS}
|
|
|
673 |
|
674 |
def add_task(examples):
|
675 |
# Could be added to the dataset loading script instead
|
676 |
+
if examples["mteb_dataset_name"] in TASK_LIST_CLASSIFICATION_NORM + TASK_LIST_CLASSIFICATION_DA + TASK_LIST_CLASSIFICATION_NB + TASK_LIST_CLASSIFICATION_PL + TASK_LIST_CLASSIFICATION_SV + TASK_LIST_CLASSIFICATION_ZH:
|
677 |
examples["mteb_task"] = "Classification"
|
678 |
+
elif examples["mteb_dataset_name"] in TASK_LIST_CLUSTERING + TASK_LIST_CLUSTERING_DE + TASK_LIST_CLUSTERING_PL + TASK_LIST_CLUSTERING_ZH:
|
679 |
examples["mteb_task"] = "Clustering"
|
680 |
+
elif examples["mteb_dataset_name"] in TASK_LIST_PAIR_CLASSIFICATION + TASK_LIST_PAIR_CLASSIFICATION_PL + TASK_LIST_PAIR_CLASSIFICATION_ZH:
|
681 |
examples["mteb_task"] = "PairClassification"
|
682 |
elif examples["mteb_dataset_name"] in TASK_LIST_RERANKING + TASK_LIST_RERANKING_ZH:
|
683 |
examples["mteb_task"] = "Reranking"
|
684 |
elif examples["mteb_dataset_name"] in TASK_LIST_RETRIEVAL_NORM + TASK_LIST_RETRIEVAL_PL + TASK_LIST_RETRIEVAL_ZH:
|
685 |
examples["mteb_task"] = "Retrieval"
|
686 |
+
elif examples["mteb_dataset_name"] in TASK_LIST_STS_NORM + TASK_LIST_STS_PL + TASK_LIST_STS_ZH:
|
687 |
examples["mteb_task"] = "STS"
|
688 |
elif examples["mteb_dataset_name"] in TASK_LIST_SUMMARIZATION:
|
689 |
examples["mteb_task"] = "Summarization"
|
|
|
954 |
|
955 |
return DATA_OVERALL_ZH
|
956 |
|
957 |
+
def get_mteb_average_pl():
    """Build the Polish overall leaderboard and the Polish per-category tables.

    Fetches the Polish results through ``get_mteb_data``, adds one overall
    average column plus one average column per category, sorts by the overall
    average, numbers the rows from 1 and rounds to two decimals. The
    module-level ``DATA_*_PL`` tables are rebuilt as a side effect.

    Returns:
        The trimmed overall table (also stored in ``DATA_OVERALL_PL``):
        rank, model metadata columns and the average columns only.
    """
    global DATA_OVERALL_PL, DATA_CLASSIFICATION_PL, DATA_CLUSTERING_PL, DATA_PAIR_CLASSIFICATION_PL, DATA_RETRIEVAL_PL, DATA_STS_PL

    # (column label, datasets) per category, in the order the average columns
    # are laid out in the overall table.
    categories = [
        ("Classification", TASK_LIST_CLASSIFICATION_PL),
        ("Clustering", TASK_LIST_CLUSTERING_PL),
        ("Pair Classification", TASK_LIST_PAIR_CLASSIFICATION_PL),
        ("Retrieval", TASK_LIST_RETRIEVAL_PL),
        ("STS", TASK_LIST_STS_PL),
    ]
    overall_col = f"Average ({len(TASK_LIST_PL)} datasets)"
    category_cols = [f"{label} Average ({len(names)} datasets)" for label, names in categories]

    DATA_OVERALL_PL = get_mteb_data(
        tasks=["Classification", "Clustering", "PairClassification", "Retrieval", "STS"],
        datasets=[name for _, names in categories for name in names],
        fillna=False,
        add_emb_dim=True,
        rank=False,
    )
    # Debugging:
    # DATA_OVERALL_PL.to_csv("overall.csv")

    # skipna=False: a model missing any dataset of a group gets NaN for that
    # group's average instead of an average over a subset.
    DATA_OVERALL_PL.insert(1, overall_col, DATA_OVERALL_PL[TASK_LIST_PL].mean(axis=1, skipna=False))
    for position, (column, (_, names)) in enumerate(zip(category_cols, categories), start=2):
        DATA_OVERALL_PL.insert(position, column, DATA_OVERALL_PL[names].mean(axis=1, skipna=False))

    DATA_OVERALL_PL.sort_values(overall_col, ascending=False, inplace=True)
    # Start ranking from 1
    DATA_OVERALL_PL.insert(0, "Rank", list(range(1, len(DATA_OVERALL_PL) + 1)))

    DATA_OVERALL_PL = DATA_OVERALL_PL.round(2)

    def _ranked_category_table(names):
        # Only keep rows with at least one score in addition to the "Model" & rank column.
        # NOTE(review): the `ne("")` filter presumably relies on add_rank
        # replacing missing scores with "" - confirm against add_rank.
        table = add_rank(DATA_OVERALL_PL[["Model"] + names])
        return table[table.iloc[:, 2:].ne("").any(axis=1)]

    DATA_CLASSIFICATION_PL = _ranked_category_table(TASK_LIST_CLASSIFICATION_PL)
    DATA_CLUSTERING_PL = _ranked_category_table(TASK_LIST_CLUSTERING_PL)
    DATA_PAIR_CLASSIFICATION_PL = _ranked_category_table(TASK_LIST_PAIR_CLASSIFICATION_PL)
    DATA_RETRIEVAL_PL = _ranked_category_table(TASK_LIST_RETRIEVAL_PL)
    DATA_STS_PL = _ranked_category_table(TASK_LIST_STS_PL)

    # Fill NaN after averaging
    DATA_OVERALL_PL.fillna("", inplace=True)

    # Keep rank, model metadata and the average columns; the per-dataset score
    # columns are dropped from the overall view.
    kept_columns = ["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length", overall_col] + category_cols
    DATA_OVERALL_PL = DATA_OVERALL_PL[kept_columns]
    # Drop models that have no average at all (columns from index 5 onwards).
    has_any_average = DATA_OVERALL_PL.iloc[:, 5:].ne("").any(axis=1)
    DATA_OVERALL_PL = DATA_OVERALL_PL[has_any_average]

    return DATA_OVERALL_PL
|
1010 |
+
|
1011 |
get_mteb_average()
|
1012 |
+
get_mteb_average_pl()
|
1013 |
get_mteb_average_zh()
|
1014 |
DATA_BITEXT_MINING = get_mteb_data(["BitextMining"], [], TASK_LIST_BITEXT_MINING)
|
1015 |
DATA_BITEXT_MINING_OTHER = get_mteb_data(["BitextMining"], [], TASK_LIST_BITEXT_MINING_OTHER)
|
|
|
1018 |
DATA_CLASSIFICATION_SV = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_SV)
|
1019 |
DATA_CLASSIFICATION_OTHER = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_OTHER)
|
1020 |
DATA_CLUSTERING_DE = get_mteb_data(["Clustering"], [], TASK_LIST_CLUSTERING_DE)
|
|
|
1021 |
DATA_STS_OTHER = get_mteb_data(["STS"], [], TASK_LIST_STS_OTHER)
|
1022 |
|
1023 |
# Exact, add all non-nan integer values for every dataset
|
|
|
1031 |
DATA_CLASSIFICATION_EN,
|
1032 |
DATA_CLASSIFICATION_DA,
|
1033 |
DATA_CLASSIFICATION_NB,
|
1034 |
+
DATA_CLASSIFICATION_PL,
|
1035 |
DATA_CLASSIFICATION_SV,
|
1036 |
DATA_CLASSIFICATION_ZH,
|
1037 |
DATA_CLASSIFICATION_OTHER,
|
1038 |
DATA_CLUSTERING,
|
1039 |
DATA_CLUSTERING_DE,
|
1040 |
+
DATA_CLUSTERING_PL,
|
1041 |
DATA_CLUSTERING_ZH,
|
1042 |
DATA_PAIR_CLASSIFICATION,
|
1043 |
+
DATA_PAIR_CLASSIFICATION_PL,
|
1044 |
DATA_PAIR_CLASSIFICATION_ZH,
|
1045 |
DATA_RERANKING,
|
1046 |
DATA_RERANKING_ZH,
|
1047 |
DATA_RETRIEVAL,
|
1048 |
+
DATA_RETRIEVAL_PL,
|
1049 |
DATA_RETRIEVAL_ZH,
|
1050 |
DATA_STS_EN,
|
1051 |
+
DATA_STS_PL,
|
1052 |
DATA_STS_ZH,
|
1053 |
DATA_STS_OTHER,
|
1054 |
DATA_SUMMARIZATION,
|
|
|
1115 |
with gr.Row():
|
1116 |
data_run_overall_zh = gr.Button("Refresh")
|
1117 |
data_run_overall_zh.click(get_mteb_average_zh, inputs=None, outputs=data_overall_zh)
|
1118 |
+
with gr.TabItem("Polish"):
|
1119 |
+
with gr.Row():
|
1120 |
+
gr.Markdown("""
|
1121 |
+
**Overall MTEB Polish leaderboard (PL-MTEB) 🔮🇵🇱**
|
1122 |
+
|
1123 |
+
- **Metric:** Various, refer to task tabs
|
1124 |
+
- **Languages:** Polish
|
1125 |
+
- **Credits:** [Rafał Poświata](https://github.com/rafalposwiata), [Konrad Wojtasik](https://github.com/kwojtasi) & [BEIR-PL](https://arxiv.org/abs/2305.19840)
|
1126 |
+
""")
|
1127 |
+
with gr.Row():
|
1128 |
+
data_overall_pl = gr.components.Dataframe(
|
1129 |
+
DATA_OVERALL_PL,
|
1130 |
+
datatype=["number", "markdown"] + ["number"] * len(DATA_OVERALL_PL.columns),
|
1131 |
+
type="pandas",
|
1132 |
+
wrap=True,
|
1133 |
+
)
|
1134 |
+
with gr.Row():
|
1135 |
+
data_run_overall_pl = gr.Button("Refresh")
|
1136 |
+
data_run_overall_pl.click(get_mteb_average_pl, inputs=None, outputs=data_overall_pl)
|
1137 |
with gr.TabItem("Bitext Mining"):
|
1138 |
with gr.TabItem("English-X"):
|
1139 |
with gr.Row():
|
|
|
1301 |
datasets_classification_nb,
|
1302 |
],
|
1303 |
outputs=data_classification_nb,
|
1304 |
+
)
|
1305 |
+
with gr.TabItem("Polish"):
|
1306 |
+
with gr.Row():
|
1307 |
+
gr.Markdown("""
|
1308 |
+
**Classification Polish Leaderboard 🤍🇵🇱**
|
1309 |
+
|
1310 |
+
- **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
|
1311 |
+
- **Languages:** Polish
|
1312 |
+
- **Credits:** [Rafał Poświata](https://github.com/rafalposwiata)
|
1313 |
+
""")
|
1314 |
+
with gr.Row():
|
1315 |
+
data_classification_pl = gr.components.Dataframe(
|
1316 |
+
DATA_CLASSIFICATION_PL,
|
1317 |
+
datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_PL.columns),
|
1318 |
+
type="pandas",
|
1319 |
+
)
|
1320 |
+
with gr.Row():
|
1321 |
+
data_run_classification_pl = gr.Button("Refresh")
|
1322 |
+
task_classification_pl = gr.Variable(value=["Classification"])
|
1323 |
+
lang_classification_pl = gr.Variable(value=[])
|
1324 |
+
datasets_classification_pl = gr.Variable(value=TASK_LIST_CLASSIFICATION_PL)
|
1325 |
+
data_run_classification_pl.click(
|
1326 |
+
get_mteb_data,
|
1327 |
+
inputs=[
|
1328 |
+
task_classification_pl,
|
1329 |
+
lang_classification_pl,
|
1330 |
+
datasets_classification_pl,
|
1331 |
+
],
|
1332 |
+
outputs=data_classification_pl,
|
1333 |
+
)
|
1334 |
with gr.TabItem("Swedish"):
|
1335 |
with gr.Row():
|
1336 |
gr.Markdown("""
|
|
|
1462 |
get_mteb_data,
|
1463 |
inputs=[task_clustering_de, lang_clustering_de, datasets_clustering_de],
|
1464 |
outputs=data_clustering_de,
|
1465 |
+
)
|
1466 |
+
with gr.TabItem("Polish"):
|
1467 |
+
with gr.Row():
|
1468 |
+
gr.Markdown("""
|
1469 |
+
**Clustering Polish Leaderboard ✨🇵🇱**
|
1470 |
+
|
1471 |
+
- **Metric:** Validity Measure (v_measure)
|
1472 |
+
- **Languages:** Polish
|
1473 |
+
- **Credits:** [Rafał Poświata](https://github.com/rafalposwiata)
|
1474 |
+
""")
|
1475 |
+
with gr.Row():
|
1476 |
+
data_clustering_pl = gr.components.Dataframe(
|
1477 |
+
DATA_CLUSTERING_PL,
|
1478 |
+
datatype=["number", "markdown"] + ["number"] * len(DATA_CLUSTERING_PL.columns) * 2,
|
1479 |
+
type="pandas",
|
1480 |
+
)
|
1481 |
+
with gr.Row():
|
1482 |
+
data_run_clustering_pl = gr.Button("Refresh")
|
1483 |
+
task_clustering_pl = gr.Variable(value=["Clustering"])
|
1484 |
+
lang_clustering_pl = gr.Variable(value=[])
|
1485 |
+
datasets_clustering_pl = gr.Variable(value=TASK_LIST_CLUSTERING_PL)
|
1486 |
+
data_run_clustering_pl.click(
|
1487 |
+
get_mteb_data,
|
1488 |
+
inputs=[task_clustering_pl, lang_clustering_pl, datasets_clustering_pl],
|
1489 |
+
outputs=data_clustering_pl,
|
1490 |
+
)
|
1491 |
with gr.TabItem("Pair Classification"):
|
1492 |
with gr.TabItem("English"):
|
1493 |
with gr.Row():
|
|
|
1546 |
],
|
1547 |
outputs=data_pair_classification_zh,
|
1548 |
)
|
1549 |
+
# Pair Classification > Polish tab: a description, the results table and a
# Refresh button that reloads the table through get_mteb_data.
with gr.TabItem("Polish"):
    with gr.Row():
        # Heading fixed: this tab shows the Polish leaderboard, not the Chinese one.
        gr.Markdown("""
        **Pair Classification Polish Leaderboard 🎭🇵🇱**

        - **Metric:** Average Precision based on Cosine Similarities (cos_sim_ap)
        - **Languages:** Polish
        - **Credits:** [Rafał Poświata](https://github.com/rafalposwiata)
        """)
    with gr.Row():
        data_pair_classification_pl = gr.components.Dataframe(
            DATA_PAIR_CLASSIFICATION_PL,
            datatype=["number", "markdown"] + ["number"] * len(DATA_PAIR_CLASSIFICATION_PL.columns),
            type="pandas",
        )
    with gr.Row():
        data_run = gr.Button("Refresh")
        task_pair_classification_pl = gr.Variable(value=["PairClassification"])
        lang_pair_classification_pl = gr.Variable(value=[])
        datasets_pair_classification_pl = gr.Variable(value=TASK_LIST_PAIR_CLASSIFICATION_PL)
        # Attach the handler to the button created in this row. It used to be
        # attached to data_run_classification_pl (the Classification tab's
        # button), leaving this tab's Refresh button without any action.
        data_run.click(
            get_mteb_data,
            inputs=[
                task_pair_classification_pl,
                lang_pair_classification_pl,
                datasets_pair_classification_pl,
            ],
            outputs=data_pair_classification_pl,
        )
|
1578 |
with gr.TabItem("Reranking"):
|
1579 |
with gr.TabItem("English"):
|
1580 |
with gr.Row():
|
|
|
1761 |
inputs=[task_sts_zh, lang_sts_zh, datasets_sts_zh],
|
1762 |
outputs=data_sts_zh,
|
1763 |
)
|
1764 |
+
with gr.TabItem("Polish"):
|
1765 |
+
with gr.Row():
|
1766 |
+
gr.Markdown("""
|
1767 |
+
**STS Polish Leaderboard 🤖🇵🇱**
|
1768 |
+
|
1769 |
+
- **Metric:** Spearman correlation based on cosine similarity
|
1770 |
+
- **Languages:** Polish
|
1771 |
+
- **Credits:** [Rafał Poświata](https://github.com/rafalposwiata)
|
1772 |
+
""")
|
1773 |
+
with gr.Row():
|
1774 |
+
data_sts_pl = gr.components.Dataframe(
|
1775 |
+
DATA_STS_PL,
|
1776 |
+
datatype=["number", "markdown"] + ["number"] * len(DATA_STS_PL.columns),
|
1777 |
+
type="pandas",
|
1778 |
+
)
|
1779 |
+
with gr.Row():
|
1780 |
+
data_run_sts_pl = gr.Button("Refresh")
|
1781 |
+
task_sts_pl = gr.Variable(value=["STS"])
|
1782 |
+
lang_sts_pl = gr.Variable(value=[])
|
1783 |
+
datasets_sts_pl = gr.Variable(value=TASK_LIST_STS_PL)
|
1784 |
+
data_run_sts_pl.click(
|
1785 |
+
get_mteb_data,
|
1786 |
+
inputs=[task_sts_pl, lang_sts_pl, datasets_sts_pl],
|
1787 |
+
outputs=data_sts_pl,
|
1788 |
+
)
|
1789 |
with gr.TabItem("Other"):
|
1790 |
with gr.Row():
|
1791 |
gr.Markdown("""
|
|
|
1852 |
# This is optional - If deactivated the data loaded at "Build time" is shown like for Overall tab
|
1853 |
"""
|
1854 |
block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1855 |
"""
|
1856 |
|
1857 |
block.queue(concurrency_count=40, max_size=10)
|