# ruff: noqa: F405, F403, F401
"""
Custom evaluation tasks for lighteval

This file generally creates just a TASKS_TABLE and TASKS_GROUPS, which are then imported by LightEval.
"""
import re
from dataclasses import asdict
from typing import Dict, List, Tuple

from custom_evaluation_utils import *

from lighteval.tasks.requests import Doc


# fmt: off
LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]
# fmt: on

_TASKS_STRINGS: List[Tuple[CustomEvaluationTask, str]] = []
_TASKS: List[CustomEvaluationTask] = []

## COMMON_SENSE_REASONING_TASKS ##
COMMON_SENSE_REASONING_TASKS = [
    CustomEvaluationTask(
        name="hellaswag",
        prompt_function="hellaswag_prompt",
        hf_repo="hellaswag",
        hf_subset="default",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    CustomEvaluationTask(
        name="winogrande",
        prompt_function="winogrande",
        hf_repo="winogrande",
        hf_subset="winogrande_xl",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    CustomEvaluationTask(
        name="piqa",
        prompt_function="piqa_harness",
        hf_repo="piqa",
        hf_subset="plain_text",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    CustomEvaluationTask(
        name="siqa",
        prompt_function="siqa_prompt",
        hf_repo="lighteval/siqa",
        hf_subset="default",
        hf_avail_splits=["train", "validation"],
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    CustomEvaluationTask(
        name="openbookqa",
        prompt_function="openbookqa",
        hf_repo="openbookqa",
        hf_subset="main",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    CustomEvaluationTask(
        name="arc:easy",
        prompt_function="arc",
        hf_repo="ai2_arc",
        hf_subset="ARC-Easy",
        evaluation_splits=["test"],
        generation_size=1,
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    CustomEvaluationTask(
        name="arc:challenge",
        prompt_function="arc",
        hf_repo="ai2_arc",
        hf_subset="ARC-Challenge",
        evaluation_splits=["test"],
        generation_size=1,
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    CustomEvaluationTask(
        name="commonsense_qa",
        prompt_function="commonsense_qa_prompt",
        hf_repo="commonsense_qa",
        hf_subset="default",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
]


def commonsense_qa_prompt(line, task_name: str = None):
    return Doc(
        task_name=task_name,
        query=line["question"],
        choices=[f" {c}" for c in line["choices"]["text"]],
        gold_index=LETTER_INDICES.index(line["answerKey"].strip()),
        instruction="",
    )


def siqa_prompt(line, task_name: str = None):
    return Doc(
        task_name=task_name,
        query=line["context"] + " " + line["question"],
        choices=[f" {c}" for c in [line["answerA"], line["answerB"], line["answerC"]]],
        gold_index=int(line["label"]) - 1,
        instruction="",
    )


def hellaswag_prompt(line, task_name: str = None):
    def preprocess(text):
        """Comes from AiHarness"""
        # text = text.strip()
        # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
        text = text.replace(" [title]", ". ")
") text = re.sub("\\[.*?\\]", "", text) text = text.replace(" ", " ") return text ctx = f"{line['ctx_a']} {line['ctx_b'].capitalize()} " return Doc( task_name=task_name, query=preprocess(line["activity_label"] + ": " + ctx), choices=[" " + preprocess(ending) for ending in line["endings"]], gold_index=int(line["label"]) if line["label"] != "" else -1, # -1 for test # "metric": "choices_loglikelihood", ) # 0 short for common sense COMMON_SENSE_REASONING_STRING = [(t, f"custom|{t.name}|0|1") for t in COMMON_SENSE_REASONING_TASKS] _TASKS_STRINGS.extend(COMMON_SENSE_REASONING_STRING) _TASKS += COMMON_SENSE_REASONING_TASKS ## WORLD_KNOWLEDGE_TASKS ## WORLD_KNOWLEDGE_TASKS = [ CustomEvaluationTask( name="trivia_qa", prompt_function="triviaqa", hf_repo="trivia_qa", hf_subset="rc.nocontext", metric=[Metrics.quasi_exact_match2], generation_size=20, stop_sequence=["\n", ".", ","], ), CustomEvaluationTask( name="natural_questions", prompt_function="natural_questions_prompt", hf_repo="lighteval/natural_questions_clean", hf_subset="default", metric=[Metrics.quasi_exact_match2], generation_size=20, stop_sequence=["\n", ".", ","], ), ] def natural_questions_prompt(line, task_name: str = None): return Doc( task_name=task_name, query=line["question"] + "?\nAnswer: ", choices=[line["short_answers"]], gold_index=0, instruction="", ) WORLD_KNOWLEDGE_STRING = [(t, f"custom|{t.name}|5|1") for t in WORLD_KNOWLEDGE_TASKS] # WORLD_KNOWLEDGE_STRING = {t: f'custom|{t.name}|0|1' for t in WORLD_KNOWLEDGE_TASKS} _TASKS_STRINGS.extend(WORLD_KNOWLEDGE_STRING) _TASKS += WORLD_KNOWLEDGE_TASKS ## Reading comprehension ## READING_COMP_TASKS = [ CustomEvaluationTask( name="super_glue:boolq", prompt_function="boolq_prompt", hf_repo="super_glue", hf_subset="boolq", metric=[Metrics.target_perplexity], ), CustomEvaluationTask( name="quac", prompt_function="quac", hf_repo="lighteval/quac_helm", hf_subset="default", metric=[Metrics.quasi_exact_match2], generation_size=20, stop_sequence=["\n", ".", ","], ), ] def boolq_prompt(line, task_name: str = None): return Doc( task_name=task_name, query=f"{line['passage']}\nQuestion: {line['question'].capitalize()}?\nAnswer:", choices=[" No", " Yes"], # Only gold gold_index=int(line["label"]), ) READING_COMP_STRING = [(t, f"custom|{t.name}|0|1") for t in READING_COMP_TASKS] _TASKS_STRINGS.extend(READING_COMP_STRING) _TASKS += READING_COMP_TASKS ## MATH ## class CustomMathEvaluationTask(CustomEvaluationTask): """Custom class for math tasks with all the defaults set""" def __init__( self, name, prompt_function="math", hf_repo="lighteval/MATH", hf_subset=None, metric=[Metrics.math_quasi_exact_match], hf_avail_splits=None, evaluation_splits=["test"], few_shots_split=None, few_shots_select=None, suite=["custom"], generation_size=40, stop_sequence=None, output_regex=None, frozen=False, ): super().__init__( name=name, prompt_function=prompt_function, hf_repo=hf_repo, hf_subset=hf_subset, metric=metric, hf_avail_splits=hf_avail_splits, evaluation_splits=evaluation_splits, few_shots_split=few_shots_split, few_shots_select=few_shots_select, suite=suite, generation_size=generation_size, stop_sequence=stop_sequence, output_regex=output_regex, frozen=frozen, ) MATH_TASKS = [ CustomMathEvaluationTask(name="math:algebra", hf_subset="algebra"), CustomMathEvaluationTask(name="math:counting_and_probability", hf_subset="counting_and_probability"), CustomMathEvaluationTask(name="math:geometry", hf_subset="geometry"), CustomMathEvaluationTask(name="math:intermediate_algebra", hf_subset="intermediate_algebra"), 
CustomMathEvaluationTask(name="math:number_theory", hf_subset="number_theory"), CustomMathEvaluationTask(name="math:prealgebra", hf_subset="prealgebra"), CustomMathEvaluationTask(name="math:precalculus", hf_subset="precalculus"), ] GSM8K = CustomEvaluationTask( name="gsm8k", prompt_function="gsm8k", hf_repo="gsm8k", hf_subset="main", hf_avail_splits=["train", "test"], evaluation_splits=["test"], metric=[Metrics.perfect_exact_match], generation_size=10, stop_sequence=["\n"], ) MATH_STRING = [(t, f"custom|{t.name}|4|1") for t in MATH_TASKS] GSM8K_STRING = [(GSM8K, f"custom|{GSM8K.name}|8|1")] _TASKS_STRINGS.extend(MATH_STRING) _TASKS_STRINGS.extend(GSM8K_STRING) _TASKS += MATH_TASKS + [GSM8K] ## MMLU ## class CustomMMLUEvaluationTask(CustomEvaluationTask): def __init__( self, name, prompt_function="mmlu_prompt", hf_repo="lighteval/mmlu", hf_subset=None, # metric=[Metrics.loglikelihood_acc_single_token], metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], hf_avail_splits=None, evaluation_splits=["test"], few_shots_split="dev", few_shots_select=None, suite=None, generation_size=-1, stop_sequence=None, output_regex=None, frozen=False, ): super().__init__( name=name, prompt_function=prompt_function, hf_repo=hf_repo, hf_subset=hf_subset, metric=metric, hf_avail_splits=hf_avail_splits, evaluation_splits=evaluation_splits, few_shots_split=few_shots_split, few_shots_select=few_shots_select, suite=suite, generation_size=generation_size, stop_sequence=stop_sequence, output_regex=output_regex, frozen=frozen, ) MMLU_TASKS = [ CustomMMLUEvaluationTask(name="mmlu:abstract_algebra", hf_subset="abstract_algebra"), CustomMMLUEvaluationTask(name="mmlu:anatomy", hf_subset="anatomy"), CustomMMLUEvaluationTask(name="mmlu:astronomy", hf_subset="astronomy"), CustomMMLUEvaluationTask(name="mmlu:business_ethics", hf_subset="business_ethics"), CustomMMLUEvaluationTask(name="mmlu:clinical_knowledge", hf_subset="clinical_knowledge"), CustomMMLUEvaluationTask(name="mmlu:college_biology", hf_subset="college_biology"), CustomMMLUEvaluationTask(name="mmlu:college_chemistry", hf_subset="college_chemistry"), CustomMMLUEvaluationTask(name="mmlu:college_computer_science", hf_subset="college_computer_science"), CustomMMLUEvaluationTask(name="mmlu:college_mathematics", hf_subset="college_mathematics"), CustomMMLUEvaluationTask(name="mmlu:college_medicine", hf_subset="college_medicine"), CustomMMLUEvaluationTask(name="mmlu:college_physics", hf_subset="college_physics"), CustomMMLUEvaluationTask(name="mmlu:computer_security", hf_subset="computer_security"), CustomMMLUEvaluationTask(name="mmlu:conceptual_physics", hf_subset="conceptual_physics"), CustomMMLUEvaluationTask(name="mmlu:econometrics", hf_subset="econometrics"), CustomMMLUEvaluationTask(name="mmlu:electrical_engineering", hf_subset="electrical_engineering"), CustomMMLUEvaluationTask(name="mmlu:elementary_mathematics", hf_subset="elementary_mathematics"), CustomMMLUEvaluationTask(name="mmlu:formal_logic", hf_subset="formal_logic"), CustomMMLUEvaluationTask(name="mmlu:global_facts", hf_subset="global_facts"), CustomMMLUEvaluationTask(name="mmlu:high_school_biology", hf_subset="high_school_biology"), CustomMMLUEvaluationTask(name="mmlu:high_school_chemistry", hf_subset="high_school_chemistry"), CustomMMLUEvaluationTask(name="mmlu:high_school_computer_science", hf_subset="high_school_computer_science"), CustomMMLUEvaluationTask(name="mmlu:high_school_european_history", hf_subset="high_school_european_history"), 
CustomMMLUEvaluationTask(name="mmlu:high_school_geography", hf_subset="high_school_geography"), CustomMMLUEvaluationTask( name="mmlu:high_school_government_and_politics", hf_subset="high_school_government_and_politics" ), CustomMMLUEvaluationTask(name="mmlu:high_school_macroeconomics", hf_subset="high_school_macroeconomics"), CustomMMLUEvaluationTask(name="mmlu:high_school_mathematics", hf_subset="high_school_mathematics"), CustomMMLUEvaluationTask(name="mmlu:high_school_microeconomics", hf_subset="high_school_microeconomics"), CustomMMLUEvaluationTask(name="mmlu:high_school_physics", hf_subset="high_school_physics"), CustomMMLUEvaluationTask(name="mmlu:high_school_psychology", hf_subset="high_school_psychology"), CustomMMLUEvaluationTask(name="mmlu:high_school_statistics", hf_subset="high_school_statistics"), CustomMMLUEvaluationTask(name="mmlu:high_school_us_history", hf_subset="high_school_us_history"), CustomMMLUEvaluationTask(name="mmlu:high_school_world_history", hf_subset="high_school_world_history"), CustomMMLUEvaluationTask(name="mmlu:human_aging", hf_subset="human_aging"), CustomMMLUEvaluationTask(name="mmlu:human_sexuality", hf_subset="human_sexuality"), CustomMMLUEvaluationTask(name="mmlu:international_law", hf_subset="international_law"), CustomMMLUEvaluationTask(name="mmlu:jurisprudence", hf_subset="jurisprudence"), CustomMMLUEvaluationTask(name="mmlu:logical_fallacies", hf_subset="logical_fallacies"), CustomMMLUEvaluationTask(name="mmlu:machine_learning", hf_subset="machine_learning"), CustomMMLUEvaluationTask(name="mmlu:management", hf_subset="management"), CustomMMLUEvaluationTask(name="mmlu:marketing", hf_subset="marketing"), CustomMMLUEvaluationTask(name="mmlu:medical_genetics", hf_subset="medical_genetics"), CustomMMLUEvaluationTask(name="mmlu:miscellaneous", hf_subset="miscellaneous"), CustomMMLUEvaluationTask(name="mmlu:moral_disputes", hf_subset="moral_disputes"), CustomMMLUEvaluationTask(name="mmlu:moral_scenarios", hf_subset="moral_scenarios"), CustomMMLUEvaluationTask(name="mmlu:nutrition", hf_subset="nutrition"), CustomMMLUEvaluationTask(name="mmlu:philosophy", hf_subset="philosophy"), CustomMMLUEvaluationTask(name="mmlu:prehistory", hf_subset="prehistory"), CustomMMLUEvaluationTask(name="mmlu:professional_accounting", hf_subset="professional_accounting"), CustomMMLUEvaluationTask(name="mmlu:professional_law", hf_subset="professional_law"), CustomMMLUEvaluationTask(name="mmlu:professional_medicine", hf_subset="professional_medicine"), CustomMMLUEvaluationTask(name="mmlu:professional_psychology", hf_subset="professional_psychology"), CustomMMLUEvaluationTask(name="mmlu:public_relations", hf_subset="public_relations"), CustomMMLUEvaluationTask(name="mmlu:security_studies", hf_subset="security_studies"), CustomMMLUEvaluationTask(name="mmlu:sociology", hf_subset="sociology"), CustomMMLUEvaluationTask(name="mmlu:us_foreign_policy", hf_subset="us_foreign_policy"), CustomMMLUEvaluationTask(name="mmlu:virology", hf_subset="virology"), CustomMMLUEvaluationTask(name="mmlu:world_religions", hf_subset="world_religions"), ] def mmlu_harness(line, task_name: str = None): topic = line["subject"] prompt = f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n" prompt += line["question"] + "\n" prompt += "".join([f"{key}. 
{choice}\n" for key, choice in zip(LETTER_INDICES, line["choices"])]) prompt += "Answer:" gold_ix = LETTER_INDICES.index(line["answer"]) if isinstance(line["answer"], str) else line["answer"] "__few_shots" in line and line["__few_shots"] is True # We are adding few shots return Doc( task_name=task_name, query=prompt, choices=[" A", " B", " C", " D"], target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix], gold_index=gold_ix, instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n", ) def mmlu_prompt(line, task_name: str = None): """MMLU prompt without letters""" topic = line["subject"] prompt = f"The following are questions about {topic.replace('_', ' ')}.\nQuestion: " prompt += line["question"] + "\nAnswer:" return Doc( task_name=task_name, query=prompt, choices=[f" {c}" for c in line["choices"]], gold_index=line["answer"], instruction=f"The following are questions about {topic.replace('_', ' ')}.\n", ) # MMLU_STRING = {t: f'custom|{t.name}|5|1' for t in MMLU_TASKS} MMLU_STRING = [(t, f"custom|{t.name}|0|1") for t in MMLU_TASKS] _TASKS_STRINGS.extend(MMLU_STRING) _TASKS += MMLU_TASKS ## BBH ## class CustomBBHEvaluationTask(CustomEvaluationTask): def __init__( self, name, prompt_function="bbh_prompt", hf_repo="lighteval/big_bench_hard", hf_subset=None, metric=[Metrics.exact_match], hf_avail_splits=["train"], evaluation_splits=["train"], few_shots_split="train", few_shots_select=None, suite=None, generation_size=4, stop_sequence=None, output_regex=None, frozen=False, ): super().__init__( name=name, prompt_function=prompt_function, hf_repo=hf_repo, hf_subset=hf_subset, metric=metric, hf_avail_splits=hf_avail_splits, evaluation_splits=evaluation_splits, few_shots_split=few_shots_split, few_shots_select=few_shots_select, suite=suite, generation_size=generation_size, stop_sequence=stop_sequence, output_regex=output_regex, frozen=frozen, ) BBH_TASKS = [ CustomBBHEvaluationTask(name="bbh:boolean_expressions", hf_subset="boolean_expressions"), CustomBBHEvaluationTask(name="bbh:causal_judgement", hf_subset="causal_judgement"), CustomBBHEvaluationTask(name="bbh:date_understanding", hf_subset="date_understanding"), CustomBBHEvaluationTask(name="bbh:disambiguation_qa", hf_subset="disambiguation_qa"), CustomBBHEvaluationTask(name="bbh:dyck_languages", hf_subset="dyck_languages"), CustomBBHEvaluationTask(name="bbh:formal_fallacies", hf_subset="formal_fallacies"), CustomBBHEvaluationTask(name="bbh:geometric_shapes", hf_subset="geometric_shapes"), CustomBBHEvaluationTask(name="bbh:hyperbaton", hf_subset="hyperbaton"), CustomBBHEvaluationTask(name="bbh:logical_deduction_five_objects", hf_subset="logical_deduction_five_objects"), CustomBBHEvaluationTask(name="bbh:logical_deduction_seven_objects", hf_subset="logical_deduction_seven_objects"), CustomBBHEvaluationTask(name="bbh:logical_deduction_three_objects", hf_subset="logical_deduction_three_objects"), CustomBBHEvaluationTask(name="bbh:movie_recommendation", hf_subset="movie_recommendation"), CustomBBHEvaluationTask(name="bbh:multistep_arithmetic_two", hf_subset="multistep_arithmetic_two"), CustomBBHEvaluationTask(name="bbh:navigate", hf_subset="navigate"), CustomBBHEvaluationTask(name="bbh:object_counting", hf_subset="object_counting"), CustomBBHEvaluationTask(name="bbh:penguins_in_a_table", hf_subset="penguins_in_a_table"), CustomBBHEvaluationTask(name="bbh:reasoning_about_colored_objects", hf_subset="reasoning_about_colored_objects"), CustomBBHEvaluationTask(name="bbh:ruin_names", 
hf_subset="ruin_names"), CustomBBHEvaluationTask( name="bbh:salient_translation_error_detection", hf_subset="salient_translation_error_detection" ), CustomBBHEvaluationTask(name="bbh:snarks", hf_subset="snarks"), CustomBBHEvaluationTask(name="bbh:sports_understanding", hf_subset="sports_understanding"), CustomBBHEvaluationTask(name="bbh:temporal_sequences", hf_subset="temporal_sequences"), CustomBBHEvaluationTask( name="bbh:tracking_shuffled_objects_five_objects", hf_subset="tracking_shuffled_objects_five_objects" ), CustomBBHEvaluationTask( name="bbh:tracking_shuffled_objects_seven_objects", hf_subset="tracking_shuffled_objects_seven_objects" ), CustomBBHEvaluationTask( name="bbh:tracking_shuffled_objects_three_objects", hf_subset="tracking_shuffled_objects_three_objects" ), CustomBBHEvaluationTask(name="bbh:web_of_lies", hf_subset="web_of_lies"), CustomBBHEvaluationTask(name="bbh:word_sorting", hf_subset="word_sorting"), ] def bbh_prompt(line, task_name: str = None): return Doc( task_name=task_name, query=line["input"] + "\nAnswer: ", choices=[line["target"]], gold_index=0, ) # BBH_STRING = {t: f'custom|{t.name}|3|1' for t in BBH_TASKS} BBH_STRING = [(t, f"custom|{t.name}|0|1") for t in BBH_TASKS] _TASKS_STRINGS.extend(BBH_STRING) _TASKS += BBH_TASKS ## AGI eval ## class CustomAGIEvalEvaluationTask(CustomEvaluationTask): def __init__( self, name, prompt_function="agi_eval_prompt_no_letters", hf_repo="lighteval/agi_eval_en", hf_subset=None, # metric=[Metrics.loglikelihood_acc_single_token], metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], hf_avail_splits=["train", "validation"], evaluation_splits=["train"], few_shots_split="validation", few_shots_select=None, suite=None, generation_size=-1, stop_sequence=None, output_regex=None, frozen=False, ): super().__init__( name=name, prompt_function=prompt_function, hf_repo=hf_repo, hf_subset=hf_subset, metric=metric, hf_avail_splits=hf_avail_splits, evaluation_splits=evaluation_splits, few_shots_split=few_shots_split, few_shots_select=few_shots_select, suite=suite, generation_size=generation_size, stop_sequence=stop_sequence, output_regex=output_regex, frozen=frozen, ) AGIEVAL_TASKS = [ CustomAGIEvalEvaluationTask(name="agi_eval:aqua_rat", hf_subset="aqua_rat"), CustomAGIEvalEvaluationTask(name="agi_eval:logiqa-en", hf_subset="logiqa-en"), CustomAGIEvalEvaluationTask(name="agi_eval:lsat-ar", hf_subset="lsat-ar"), CustomAGIEvalEvaluationTask(name="agi_eval:lsat-lr", hf_subset="lsat-lr"), CustomAGIEvalEvaluationTask(name="agi_eval:lsat-rc", hf_subset="lsat-rc"), CustomAGIEvalEvaluationTask( name="agi_eval:math", hf_subset="math", prompt_function="agi_eval_math_prompt", metric=[Metrics.exact_match, Metrics.quasi_exact_match2], generation_size=40, ), CustomAGIEvalEvaluationTask(name="agi_eval:sat-en", hf_subset="sat-en"), CustomAGIEvalEvaluationTask(name="agi_eval:sat-math", hf_subset="sat-math"), ] def agi_eval_math_prompt(line, task_name: str = None): return Doc( task_name=task_name, query=line["question"], choices=[line["answer"]], gold_index=0, instruction="", ) def agi_eval_prompt(line, task_name: str = None): cleaned_options = [o.replace("(", "").replace(")", " ") for o in line["options"]] prompt = "The following are multiple choice questions (with answers).\n\n" prompt += line["question"] + "\n" + "\n".join(cleaned_options) + "\n" prompt += "Answer: " choices = LETTER_INDICES[: len(line["options"])] output = Doc( query=prompt, instruction="The following are multiple choice questions (with answers).\n\n", ) if 
line["label"]: output.choices = choices output.gold_index = LETTER_INDICES.index(line["label"].strip()) else: output.choices = [line["answer"]] output.gold_index = 0 return output def agi_eval_prompt_no_letters(line, task_name: str = None): cleaned_options = [ " " + o.replace("(A)", "").replace("(B)", "").replace("(C)", "").replace("(D)", "").replace("(E)", "") for o in line["options"] ] output = Doc( query=line["question"], choices=cleaned_options, gold_index=LETTER_INDICES.index(line["label"].strip()), instruction="", ) return output # AGIEVAL_STRING = {t: f'custom|{t.name}|5|1' for t in AGIEVAL_TASKS} AGIEVAL_STRING = [(t, f"custom|{t.name}|0|1") for t in AGIEVAL_TASKS] _TASKS_STRINGS.extend(AGIEVAL_STRING) _TASKS += AGIEVAL_TASKS ## HUMAN EVAL ## # human_eval = CustomEvaluationTask( # name="human_eval", # prompt_function="human_eval", # hf_repo="lighteval/human_eval", # metric=["human_eval_pass_at_1"], # ), def has_generative_metrics(task: CustomEvaluationTask) -> bool: for metric in task.metric: if metric in NEEDS_GENERATION_ONLY: return True return False EARLY_SIGNAL_TASKS = ",".join([t[1] for t in COMMON_SENSE_REASONING_STRING] + [t[1] for t in MMLU_STRING]) # Convert to dict for lighteval TASKS_TABLE = [asdict(task) for task in _TASKS] # You can have a few pre-organised groups of tasks TASKS_GROUPS = { "all": ",".join(t[1] for t in _TASKS_STRINGS), "early-signal": EARLY_SIGNAL_TASKS, "non-generatives": ",".join(t for k, t in _TASKS_STRINGS if not has_generative_metrics(k)), "generatives": ",".join(t for k, t in _TASKS_STRINGS if has_generative_metrics(k)), } if __name__ == "__main__": print(t["name"] for t in TASKS_TABLE) print(len(TASKS_TABLE))