Superglue/RealNews Contamination based on "Noise-Robust De-Duplication at Scale"
#15
by
emilys
- opened
- contamination_report.csv +46 -0
contamination_report.csv
CHANGED
@@ -3,6 +3,8 @@ Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Develo
|
|
3 |
gsm8k;;GPT-4;model;79.00;;;model-based;https://arxiv.org/abs/2311.06233;8
|
4 |
ucinlp/drop;;GPT-4;model;;44.00;;model-based;https://arxiv.org/abs/2311.06233;8
|
5 |
openai_humaneval;;GPT-4;model;;;56.71;model-based;https://arxiv.org/abs/2311.06233;8
|
|
|
|
|
6 |
imdb;;GPT-4;model;;;82.00;model-based;https://arxiv.org/abs/2311.06233;8
|
7 |
imdb;;GPT-3.5;model;;;55.00;model-based;https://arxiv.org/abs/2311.06233;8
|
8 |
ag_news;;GPT-4;model;;;91.00;model-based;https://arxiv.org/abs/2311.06233;8
|
@@ -477,6 +479,38 @@ EdinburghNLP/xsum;;GPT-3.5;model;0.0;;100.0;model-based;https://arxiv.org/abs/23
|
|
477 |
bigbio/mednli;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
478 |
bigbio/mednli;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
479 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
480 |
RadNLI;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
481 |
RadNLI;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
482 |
RadNLI;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
@@ -597,3 +631,15 @@ ibragim-bad/arc_challenge;;FLAN;model;;15.6;;data-based;https://arxiv.org/abs/21
|
|
597 |
facebook/anli;dev_r3;FLAN;model;;40.2;;data-based;https://arxiv.org/abs/2109.01652;13
|
598 |
facebook/anli;dev_r2;FLAN;model;;97.9;;data-based;https://arxiv.org/abs/2109.01652;13
|
599 |
facebook/anli;dev_r1;FLAN;model;;98.6;;data-based;https://arxiv.org/abs/2109.01652;13
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
gsm8k;;GPT-4;model;79.00;;;model-based;https://arxiv.org/abs/2311.06233;8
|
4 |
ucinlp/drop;;GPT-4;model;;44.00;;model-based;https://arxiv.org/abs/2311.06233;8
|
5 |
openai_humaneval;;GPT-4;model;;;56.71;model-based;https://arxiv.org/abs/2311.06233;8
|
6 |
+
openai_humaneval;;GPT-3.5-turbo/0613;model;;;23.79;model-based;https://arxiv.org/abs/2402.15938;16
|
7 |
+
openai_humaneval;;GPT-3.5-turbo/1106;model;;;41.47;model-based;https://arxiv.org/abs/2402.15938;16
|
8 |
imdb;;GPT-4;model;;;82.00;model-based;https://arxiv.org/abs/2311.06233;8
|
9 |
imdb;;GPT-3.5;model;;;55.00;model-based;https://arxiv.org/abs/2311.06233;8
|
10 |
ag_news;;GPT-4;model;;;91.00;model-based;https://arxiv.org/abs/2311.06233;8
|
|
|
479 |
bigbio/mednli;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
480 |
bigbio/mednli;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
481 |
|
482 |
+
RadNLI;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/pdf/2308.08493;8
|
483 |
+
RadNLI;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/pdf/2308.08493;8
|
484 |
+
|
485 |
+
|
486 |
+
openai_humaneval;;GPT-4;model;;;25.0;data-based;https://arxiv.org/abs/2303.08774;11
|
487 |
+
ucinlp/drop;;GPT-4;model;;21.0;;data-based;https://arxiv.org/abs/2303.08774;11
|
488 |
+
bigbench;;GPT-4;model;;;100.0;data-based;https://arxiv.org/abs/2303.08774;11
|
489 |
+
gsm8k;;GPT-4;model;100.0;;1.0;data-based;https://arxiv.org/abs/2303.08774;11
|
490 |
+
EleutherAI/hendrycks_math;;GPT-4;model;100.0;;;data-based;https://arxiv.org/abs/2303.08774;11
|
491 |
+
cais/mmlu;;GPT-4;model;;;0.6;data-based;https://arxiv.org/abs/2303.08774;11
|
492 |
+
ibragim-bad/arc_challenge;;GPT-4;model;;;3.4;data-based;https://arxiv.org/abs/2303.08774;11
|
493 |
+
winogrande;;GPT-4;model;;;0.9;data-based;https://arxiv.org/abs/2303.08774;11
|
494 |
+
cais/mmlu;;GPT-3.5;model;;;52.0;model-based;https://arxiv.org/abs/2311.09783;10
|
495 |
+
winogrande;;GPT-3.5;model;;;9.0;model-based;https://arxiv.org/abs/2311.09783;10
|
496 |
+
truthful_qa;;GPT-3.5;model;;;12.0;model-based;https://arxiv.org/abs/2311.09783;10
|
497 |
+
allenai/openbookqa;;GPT-3.5;model;;;1.0;model-based;https://arxiv.org/abs/2311.09783;10
|
498 |
+
|
499 |
+
cais/mmlu;;GPT-4;model;;;57.0;model-based;https://arxiv.org/abs/2311.09783;10
|
500 |
+
truthful_qa;;GPT-4;model;;;10.0;model-based;https://arxiv.org/abs/2311.09783;10
|
501 |
+
winogrande;;GPT-4;model;;;12.0;model-based;https://arxiv.org/abs/2311.09783;10
|
502 |
+
allenai/openbookqa;;GPT-4;model;;;1.0;model-based;https://arxiv.org/abs/2311.09783;10
|
503 |
+
Rowan/hellaswag;;GPT-4;model;;;2.0;model-based;https://arxiv.org/abs/2311.09783;10
|
504 |
+
|
505 |
+
|
506 |
+
allenai/openbookqa;;LLaMa 2-13B;model;;;4.0;model-based;https://arxiv.org/abs/2311.09783;10
|
507 |
+
truthful_qa;;LLaMa 2-13B;model;;;2.0;model-based;https://arxiv.org/abs/2311.09783;10
|
508 |
+
winogrande;;LLaMa 2-13B;model;;;1.0;model-based;https://arxiv.org/abs/2311.09783;10
|
509 |
+
|
510 |
+
truthful_qa;;Mistral-7B;model;;;15.0;model-based;https://arxiv.org/abs/2311.09783;10
|
511 |
+
allenai/openbookqa;;Mistral-7B;model;;;10.0;model-based;https://arxiv.org/abs/2311.09783;10
|
512 |
+
winogrande;;Mistral-7B;model;;;3.0;model-based;https://arxiv.org/abs/2311.09783;10
|
513 |
+
cais/mmlu;;Mistral-7B;model;;;1.0;model-based;https://arxiv.org/abs/2311.09783;10
|
514 |
RadNLI;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
515 |
RadNLI;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
516 |
RadNLI;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
|
|
631 |
facebook/anli;dev_r3;FLAN;model;;40.2;;data-based;https://arxiv.org/abs/2109.01652;13
|
632 |
facebook/anli;dev_r2;FLAN;model;;97.9;;data-based;https://arxiv.org/abs/2109.01652;13
|
633 |
facebook/anli;dev_r1;FLAN;model;;98.6;;data-based;https://arxiv.org/abs/2109.01652;13
|
634 |
+
|
635 |
+
ibragim-bad/arc_easy;;mistralai/Mistral-7B-v0.1;model;;;100.0;model-based;https://arxiv.org/abs/2310.17623;14
|
636 |
+
|
637 |
+
super_glue;boolq;allenai/c4 (realnewslike);corpus;;;0.6;data-based;https://arxiv.org/abs/2210.04261;15
|
638 |
+
super_glue;cb;allenai/c4 (realnewslike);corpus;;;0.0;data-based;https://arxiv.org/abs/2210.04261;15
|
639 |
+
super_glue;copa;allenai/c4 (realnewslike);corpus;;;0.0;data-based;https://arxiv.org/abs/2210.04261;15
|
640 |
+
super_glue;multirc;allenai/c4 (realnewslike);corpus;;;1.2;data-based;https://arxiv.org/abs/2210.04261;15
|
641 |
+
super_glue;record;allenai/c4 (realnewslike);corpus;;;7.3;data-based;https://arxiv.org/abs/2210.04261;15
|
642 |
+
super_glue;rte;allenai/c4 (realnewslike);corpus;;;1.1;data-based;https://arxiv.org/abs/2210.04261;15
|
643 |
+
super_glue;wic;allenai/c4 (realnewslike);corpus;;;0.0;data-based;https://arxiv.org/abs/2210.04261;15
|
644 |
+
super_glue;wsc;allenai/c4 (realnewslike);corpus;;;0.0;data-based;https://arxiv.org/abs/2210.04261;15
|
645 |
+
|