Muennighoff's picture
Organize
5754a9f
raw
history blame
13.9 kB
dataset,prompt,metric,value
amazon_reviews_multi_en,prompt_body_title_to_star,accuracy,0.5508
amazon_reviews_multi_en,prompt_review_to_star,accuracy,0.51
amazon_reviews_multi_en,prompt_title_to_star,accuracy,0.3758
amazon_reviews_multi_en,median,accuracy,0.51
amazon_reviews_multi_es,prompt_body_title_to_star,accuracy,0.4776
amazon_reviews_multi_es,prompt_review_to_star,accuracy,0.4444
amazon_reviews_multi_es,prompt_title_to_star,accuracy,0.3088
amazon_reviews_multi_es,median,accuracy,0.4444
amazon_reviews_multi_fr,prompt_body_title_to_star,accuracy,0.4742
amazon_reviews_multi_fr,prompt_review_to_star,accuracy,0.4492
amazon_reviews_multi_fr,prompt_title_to_star,accuracy,0.3192
amazon_reviews_multi_fr,median,accuracy,0.4492
amazon_reviews_multi_zh,prompt_body_title_to_star,accuracy,0.4712
amazon_reviews_multi_zh,prompt_review_to_star,accuracy,0.4478
amazon_reviews_multi_zh,prompt_title_to_star,accuracy,0.3208
amazon_reviews_multi_zh,median,accuracy,0.4478
aqua_rat_raw,Answer questions from options,accuracy,0.2440944881889764
aqua_rat_raw,answer_quiz,accuracy,0.23228346456692914
aqua_rat_raw,select_the_best_option,accuracy,0.25196850393700787
aqua_rat_raw,median,accuracy,0.2440944881889764
art_None,choose_hypothesis,accuracy,0.6109660574412533
art_None,choose_hypothesis_believable,accuracy,0.5926892950391645
art_None,choose_hypothesis_desc,accuracy,0.554177545691906
art_None,choose_hypothesis_likely,accuracy,0.5587467362924282
art_None,choose_hypothesis_options,accuracy,0.5842036553524804
art_None,median,accuracy,0.5842036553524804
banking77_None,direct_to_which_department,accuracy,0.19675324675324676
banking77_None,help_page_topic,accuracy,0.2538961038961039
banking77_None,rephrase_as_banking_term,accuracy,0.2636363636363636
banking77_None,median,accuracy,0.2538961038961039
blbooksgenre_title_genre_classifiction,classify,accuracy,0.27764976958525345
blbooksgenre_title_genre_classifiction,multi-choice,accuracy,0.8456221198156681
blbooksgenre_title_genre_classifiction,premise_context_first,accuracy,0.7494239631336406
blbooksgenre_title_genre_classifiction,median,accuracy,0.7494239631336406
blimp_adjunct_island,grammatical_between_1_2,accuracy,0.516
blimp_adjunct_island,grammatical_between_A_B,accuracy,0.428
blimp_adjunct_island,grammatical_which_one_1_2,accuracy,0.576
blimp_adjunct_island,single_sentence_bad_yes_no,accuracy,0.478
blimp_adjunct_island,single_sentence_good_yes_no,accuracy,0.499
blimp_adjunct_island,median,accuracy,0.499
climate_fever_None,claim_and_all_supporting_evidences,accuracy,0.4273615635179153
climate_fever_None,fifth_evidence_and_claim_itemization,accuracy,0.43973941368078173
climate_fever_None,first_evidence_and_claim_itemization,accuracy,0.3283387622149837
climate_fever_None,second_evidence_and_claim_itemization,accuracy,0.47687296416938113
climate_fever_None,third_evidence_claim_pair,accuracy,0.4586319218241042
climate_fever_None,median,accuracy,0.43973941368078173
codah_codah,affirmative_instruction_after_sentence_and_choices,accuracy,0.7327089337175793
codah_codah,affirmative_instruction_before_sentence_and_choices,accuracy,0.7359510086455331
codah_codah,interrogative_instruction_after_sentence_and_choices,accuracy,0.736671469740634
codah_codah,median,accuracy,0.7359510086455331
commonsense_qa_None,answer_given_question_without_options,accuracy,0.6224406224406225
commonsense_qa_None,most_suitable_answer,accuracy,0.8435708435708436
commonsense_qa_None,question_answering,accuracy,0.8304668304668305
commonsense_qa_None,median,accuracy,0.8304668304668305
conv_ai_3_None,ambiguous,accuracy,0.39040207522697795
conv_ai_3_None,clarification_needed,accuracy,0.39040207522697795
conv_ai_3_None,directly_answer,accuracy,0.6095979247730221
conv_ai_3_None,score_give_number,accuracy,0.37959360138348464
conv_ai_3_None,score_how_much,accuracy,0.03285776048421963
conv_ai_3_None,median,accuracy,0.39040207522697795
craigslist_bargains_None,best deal,accuracy,0.49246231155778897
craigslist_bargains_None,good deal for seller,accuracy,0.4371859296482412
craigslist_bargains_None,good deal for seller no list price,accuracy,0.6046901172529313
craigslist_bargains_None,good deal for seller no list price implicit,accuracy,0.25963149078726966
craigslist_bargains_None,median,accuracy,0.4648241206030151
emotion_None,answer_question_with_emotion_label,accuracy,0.344
emotion_None,answer_with_class_label,accuracy,0.2295
emotion_None,choose_the_best_emotion_label,accuracy,0.317
emotion_None,reply_with_emoation_label,accuracy,0.5025
emotion_None,median,accuracy,0.3305
financial_phrasebank_sentences_allagree,bullish_neutral_bearish,accuracy,0.3710247349823322
financial_phrasebank_sentences_allagree,complementary_industries,accuracy,0.04637809187279152
financial_phrasebank_sentences_allagree,sentiment,accuracy,0.3489399293286219
financial_phrasebank_sentences_allagree,share_price_option,accuracy,0.37146643109540634
financial_phrasebank_sentences_allagree,word_comes_to_mind,accuracy,0.01987632508833922
financial_phrasebank_sentences_allagree,median,accuracy,0.3489399293286219
glue_cola,Following sentence acceptable,accuracy,0.5685522531160115
glue_cola,Make sense yes no,accuracy,0.3326941514860978
glue_cola,Previous sentence acceptable,accuracy,0.3096836049856184
glue_cola,editing,accuracy,0.3144774688398849
glue_cola,is_this_correct,accuracy,0.4592521572387344
glue_cola,median,accuracy,0.3326941514860978
glue_sst2,following positive negative,accuracy,0.9415137614678899
glue_sst2,happy or mad,accuracy,0.9013761467889908
glue_sst2,positive negative after,accuracy,0.9461009174311926
glue_sst2,review,accuracy,0.9403669724770642
glue_sst2,said,accuracy,0.9185779816513762
glue_sst2,median,accuracy,0.9403669724770642
head_qa_en,multiple_choice_a_and_q_en,accuracy,0.27379209370424595
head_qa_en,multiple_choice_a_and_q_with_context_en,accuracy,0.2730600292825769
head_qa_en,multiple_choice_q_and_a_en,accuracy,0.40922401171303074
head_qa_en,multiple_choice_q_and_a_index_en,accuracy,0.3916544655929722
head_qa_en,multiple_choice_q_and_a_index_with_context_en,accuracy,0.3857979502196193
head_qa_en,median,accuracy,0.3857979502196193
head_qa_es,multiple_choice_a_and_q_en,accuracy,0.2679355783308931
head_qa_es,multiple_choice_a_and_q_with_context_en,accuracy,0.2642752562225476
head_qa_es,multiple_choice_q_and_a_en,accuracy,0.39751098096632503
head_qa_es,multiple_choice_q_and_a_index_en,accuracy,0.3506588579795022
head_qa_es,multiple_choice_q_and_a_index_with_context_en,accuracy,0.3440702781844802
head_qa_es,median,accuracy,0.3440702781844802
health_fact_None,claim_explanation_classification,accuracy,0.5755102040816327
health_fact_None,claim_veracity_classification_after_reading_I_believe,accuracy,0.31510204081632653
health_fact_None,claim_veracity_classification_tell_me,accuracy,0.053877551020408164
health_fact_None,median,accuracy,0.31510204081632653
hlgd_None,is_same_event_editor_asks,accuracy,0.6230062832286128
hlgd_None,is_same_event_interrogative_talk,accuracy,0.7056549057515709
hlgd_None,is_same_event_refer,accuracy,0.6457225712904785
hlgd_None,is_same_event_with_time_interrogative_related,accuracy,0.7873368777187046
hlgd_None,is_same_event_with_time_interrogative_talk,accuracy,0.8182696955050749
hlgd_None,median,accuracy,0.7056549057515709
hyperpartisan_news_detection_byarticle,consider_does_it_follow_a_hyperpartisan_argumentation,accuracy,0.6310077519379845
hyperpartisan_news_detection_byarticle,consider_it_exhibits_extreme_one_sidedness,accuracy,0.6310077519379845
hyperpartisan_news_detection_byarticle,consume_with_caution,accuracy,0.6310077519379845
hyperpartisan_news_detection_byarticle,extreme_left_wing_or_right_wing,accuracy,0.6310077519379845
hyperpartisan_news_detection_byarticle,follows_hyperpartisan_argumentation,accuracy,0.6310077519379845
hyperpartisan_news_detection_byarticle,median,accuracy,0.6310077519379845
liar_None,Given statement guess category,accuracy,0.20794392523364486
liar_None,median,accuracy,0.20794392523364486
lince_sa_spaeng,express sentiment,accuracy,0.5814954276492738
lince_sa_spaeng,negation template,accuracy,0.16621839698762775
lince_sa_spaeng,original poster expressed sentiment,accuracy,0.5831091984938139
lince_sa_spaeng,sentiment trying to express,accuracy,0.584722969338354
lince_sa_spaeng,the author seem,accuracy,0.5691231845077999
lince_sa_spaeng,median,accuracy,0.5814954276492738
math_qa_None,choose_correct_og,accuracy,0.2100502512562814
math_qa_None,first_choice_then_problem,accuracy,0.21708542713567838
math_qa_None,gre_problem,accuracy,0.20871021775544388
math_qa_None,pick_the_correct,accuracy,0.21139028475711893
math_qa_None,problem_set_type,accuracy,0.38123953098827473
math_qa_None,median,accuracy,0.21139028475711893
mlsum_es,layman_summ_es,bleu,0.03612948631805906
mlsum_es,palm_prompt,bleu,0.048150532126973386
mlsum_es,summarise_this_in_es_few_sentences,bleu,0.03385324638634216
mlsum_es,median,bleu,0.03612948631805906
movie_rationales_None,Evidences + review,accuracy,0.975
movie_rationales_None,Evidences sentiment classification,accuracy,0.99
movie_rationales_None,Standard binary sentiment analysis,accuracy,0.95
movie_rationales_None,median,accuracy,0.975
mwsc_None,in-the-sentence,accuracy,0.5853658536585366
mwsc_None,in-the-sentence-question-first,accuracy,0.6219512195121951
mwsc_None,is-correct,accuracy,0.5487804878048781
mwsc_None,options-or,accuracy,0.6951219512195121
mwsc_None,what-think,accuracy,0.6951219512195121
mwsc_None,median,accuracy,0.6219512195121951
onestop_english_None,ara_context,accuracy,0.2945326278659612
onestop_english_None,assess,accuracy,0.54673721340388
onestop_english_None,determine_reading_level_from_the_first_three_sentences,accuracy,0.37918871252204583
onestop_english_None,esl_context,accuracy,0.6402116402116402
onestop_english_None,esl_variation,accuracy,0.5961199294532628
onestop_english_None,median,accuracy,0.54673721340388
poem_sentiment_None,guess_sentiment_without_options_variation_1,accuracy,0.21904761904761905
poem_sentiment_None,most_appropriate_sentiment,accuracy,0.29523809523809524
poem_sentiment_None,positive_or_negative_sentiment_variation_1,accuracy,0.21904761904761905
poem_sentiment_None,positive_or_negative_sentiment_variation_2,accuracy,0.22857142857142856
poem_sentiment_None,question_answer_format,accuracy,0.2571428571428571
poem_sentiment_None,median,accuracy,0.22857142857142856
pubmed_qa_pqa_labeled,Long Answer to Final Decision,accuracy,0.648
pubmed_qa_pqa_labeled,Question Answering (Short),accuracy,0.695
pubmed_qa_pqa_labeled,median,accuracy,0.6715
riddle_sense_None,answer_given_question_without_options,accuracy,0.48090107737512244
riddle_sense_None,most_suitable_answer,accuracy,0.40254652301665034
riddle_sense_None,question_answering,accuracy,0.3868756121449559
riddle_sense_None,question_to_answer_index,accuracy,0.3702252693437806
riddle_sense_None,median,accuracy,0.3947110675808031
scicite_None,Classify intent,accuracy,0.20414847161572053
scicite_None,Classify intent (choices first),accuracy,0.21069868995633187
scicite_None,Classify intent (select choice),accuracy,0.45414847161572053
scicite_None,Classify intent w/section (select choice),accuracy,0.5032751091703057
scicite_None,can_describe,accuracy,0.34279475982532753
scicite_None,median,accuracy,0.34279475982532753
selqa_answer_selection_analysis,is-he-talking-about,accuracy,0.9031847133757962
selqa_answer_selection_analysis,make-sense-rand,accuracy,0.8815286624203822
selqa_answer_selection_analysis,which-answer-1st-vs-random,accuracy,0.8726114649681529
selqa_answer_selection_analysis,would-make-sense-qu-rand,accuracy,0.9121019108280255
selqa_answer_selection_analysis,median,accuracy,0.8923566878980892
snips_built_in_intents_None,categorize_query,accuracy,0.39939024390243905
snips_built_in_intents_None,categorize_query_brief,accuracy,0.36585365853658536
snips_built_in_intents_None,intent_query,accuracy,0.31097560975609756
snips_built_in_intents_None,query_intent,accuracy,0.5823170731707317
snips_built_in_intents_None,voice_intent,accuracy,0.5762195121951219
snips_built_in_intents_None,median,accuracy,0.39939024390243905
wmt14_fr_en_en-fr,a_good_translation-en-fr-source+target,bleu,0.03901997019133066
wmt14_fr_en_en-fr,a_good_translation-en-fr-target,bleu,0.013934207960053381
wmt14_fr_en_en-fr,gpt3-en-fr,bleu,0.0008726814351547542
wmt14_fr_en_en-fr,version-en-fr-target,bleu,0.04126763289443808
wmt14_fr_en_en-fr,xglm-en-fr-target,bleu,0.2594147632125033
wmt14_fr_en_en-fr,median,bleu,0.03901997019133066
wmt14_fr_en_fr-en,a_good_translation-fr-en-source+target,bleu,0.29535567491027065
wmt14_fr_en_fr-en,a_good_translation-fr-en-target,bleu,0.10053995021986518
wmt14_fr_en_fr-en,gpt3-fr-en,bleu,0.05996411710924088
wmt14_fr_en_fr-en,version-fr-en-target,bleu,0.2543366934119538
wmt14_fr_en_fr-en,xglm-fr-en-target,bleu,0.289915194963351
wmt14_fr_en_fr-en,median,bleu,0.2543366934119538
wmt14_hi_en_en-hi,a_good_translation-en-hi-source+target,bleu,0.006990276538877561
wmt14_hi_en_en-hi,a_good_translation-en-hi-target,bleu,0.0018050206530453908
wmt14_hi_en_en-hi,gpt-3-en-hi-target,bleu,2.984520737729336e-10
wmt14_hi_en_en-hi,version-en-hi-target,bleu,0.007268866226269155
wmt14_hi_en_en-hi,xglm-en-hi-target,bleu,0.06785861030301621
wmt14_hi_en_en-hi,median,bleu,0.006990276538877561
wmt14_hi_en_hi-en,a_good_translation-hi-en-source+target,bleu,0.15724256465201472
wmt14_hi_en_hi-en,a_good_translation-hi-en-target,bleu,0.06515805969434861
wmt14_hi_en_hi-en,gpt-3-hi-en-target,bleu,1.9706666216345307e-162
wmt14_hi_en_hi-en,version-hi-en-target,bleu,0.15422032309127792
wmt14_hi_en_hi-en,xglm-hi-en-target,bleu,0.17022583047573708
wmt14_hi_en_hi-en,median,bleu,0.15422032309127792
multiple,average,multiple,0.4485518661820451