update all

Browse files

Files changed (10) hide show

config_mistral_7b.py +2 -1
config_mistral_7b.yaml +1 -1
config_tiny_mistral.py +2 -1
custom_evaluation_tasks.py +0 -650
custom_evaluation_utils.py +0 -158
lighteval_eval_config.yaml +6 -20
modeling_mistral.py +1 -2
pretrained/Mistral-7B-v0.1/config.yaml +1 -1
run_evals.py +11 -394
run_generate.py +2 -3

config_mistral_7b.py CHANGED Viewed

@@ -66,7 +66,7 @@ PARALLELISM = ParallelismArgs(
 )
 CONFIG = Config(
-    general=GeneralArgs(project="mistralai", run="Mistral-7B-v0.1", seed=42),
     checkpoints=None,
     parallelism=PARALLELISM,
     model=ModelArgs(init_method=RandomInit(std=0.025), model_config=MODEL_CONFIG),
@@ -76,6 +76,7 @@ CONFIG = Config(
     tokens=None,
     data=None,
     profiler=None,
 )
 if __name__ == "__main__":

 )
 CONFIG = Config(
+    general=GeneralArgs(project="mistralai", run="Mistral-7B-v0.1", seed=42, step=0),
     checkpoints=None,
     parallelism=PARALLELISM,
     model=ModelArgs(init_method=RandomInit(std=0.025), model_config=MODEL_CONFIG),
     tokens=None,
     data=None,
     profiler=None,
+    lighteval=None,
 )
 if __name__ == "__main__":

config_mistral_7b.yaml CHANGED Viewed

@@ -7,7 +7,7 @@ general:
   project: mistralai
   run: Mistral-7B-v0.1
   seed: 42
-  step: null
 logging: null
 model:
   ddp_bucket_cap_mb: 25

   project: mistralai
   run: Mistral-7B-v0.1
   seed: 42
+  step: 0
 logging: null
 model:
   ddp_bucket_cap_mb: 25

config_tiny_mistral.py CHANGED Viewed

@@ -92,7 +92,7 @@ checkpoints_path = os.path.dirname(os.path.dirname(__file__)) + "/checkpoints"
 os.makedirs(checkpoints_path, exist_ok=True)
 config = Config(
-    general=GeneralArgs(project="debug", run="tiny_mistral", seed=seed),
     checkpoints=CheckpointsArgs(checkpoints_path=checkpoints_path, checkpoint_interval=10),
     parallelism=parallelism,
     model=ModelArgs(init_method=RandomInit(std=0.025), model_config=model_config),
@@ -102,6 +102,7 @@ config = Config(
     tokens=tokens,
     data=DataArgs(dataset=dataset, seed=seed),
     profiler=None,
 )
 if __name__ == "__main__":

 os.makedirs(checkpoints_path, exist_ok=True)
 config = Config(
+    general=GeneralArgs(project="debug", run="tiny_mistral", seed=seed, step=0),
     checkpoints=CheckpointsArgs(checkpoints_path=checkpoints_path, checkpoint_interval=10),
     parallelism=parallelism,
     model=ModelArgs(init_method=RandomInit(std=0.025), model_config=model_config),
     tokens=tokens,
     data=DataArgs(dataset=dataset, seed=seed),
     profiler=None,
+    lighteval=None,
 )
 if __name__ == "__main__":

custom_evaluation_tasks.py DELETED Viewed

@@ -1,650 +0,0 @@
-# ruff: noqa: F405, F403, F401
-"""
-Custom evaluation tasks for lighteval
-This file generally just creates a TASKS_TABLE and TASKS_GROUPS, which are then imported by LightEval.
-"""
-import re
-from dataclasses import asdict
-from typing import Dict, List, Tuple
-from custom_evaluation_utils import *
-from lighteval.tasks.requests import Doc
-# fmt: off
-LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]
-# fmt: on
-_TASKS_STRINGS: List[Tuple[CustomEvaluationTask, str]] = []
-_TASKS: List[CustomEvaluationTask] = []
-## COMMON_SENSE_REASONING_TASKS ##
-COMMON_SENSE_REASONING_TASKS = [
-    CustomEvaluationTask(
-        name="hellaswag",
-        prompt_function="hellaswag_prompt",
-        hf_repo="hellaswag",
-        hf_subset="default",
-        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
-    ),
-    CustomEvaluationTask(
-        name="winogrande",
-        prompt_function="winogrande",
-        hf_repo="winogrande",
-        hf_subset="winogrande_xl",
-        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
-    ),
-    CustomEvaluationTask(
-        name="piqa",
-        prompt_function="piqa_harness",
-        hf_repo="piqa",
-        hf_subset="plain_text",
-        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
-    ),
-    CustomEvaluationTask(
-        name="siqa",
-        prompt_function="siqa_prompt",
-        hf_repo="lighteval/siqa",
-        hf_subset="default",
-        hf_avail_splits=["train", "validation"],
-        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
-    ),
-    CustomEvaluationTask(
-        name="openbookqa",
-        prompt_function="openbookqa",
-        hf_repo="openbookqa",
-        hf_subset="main",
-        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
-    ),
-    CustomEvaluationTask(
-        name="arc:easy",
-        prompt_function="arc",
-        hf_repo="ai2_arc",
-        hf_subset="ARC-Easy",
-        evaluation_splits=["test"],
-        generation_size=1,
-        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
-    ),
-    CustomEvaluationTask(
-        name="arc:challenge",
-        prompt_function="arc",
-        hf_repo="ai2_arc",
-        hf_subset="ARC-Challenge",
-        evaluation_splits=["test"],
-        generation_size=1,
-        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
-    ),
-    CustomEvaluationTask(
-        name="commonsense_qa",
-        prompt_function="commonsense_qa_prompt",
-        hf_repo="commonsense_qa",
-        hf_subset="default",
-        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
-    ),
-]
-def commonsense_qa_prompt(line, task_name: str = None):
-    return Doc(
-        task_name=task_name,
-        query=line["question"],
-        choices=[f" {c}" for c in line["choices"]["text"]],
-        gold_index=LETTER_INDICES.index(line["answerKey"].strip()),
-        instruction="",
-    )
-def siqa_prompt(line, task_name: str = None):
-    return Doc(
-        task_name=task_name,
-        query=line["context"] + " " + line["question"],
-        choices=[f" {c}" for c in [line["answerA"], line["answerB"], line["answerC"]]],
-        gold_index=int(line["label"]) - 1,
-        instruction="",
-    )
-def hellaswag_prompt(line, task_name: str = None):
-    def preprocess(text):
-        """Comes from AiHarness"""
-        # text = text.strip()
-        # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
-        text = text.replace(" [title]", ". ")
-        text = re.sub("\\[.*?\\]", "", text)
-        text = text.replace("  ", " ")
-        return text
-    ctx = f"{line['ctx_a']} {line['ctx_b'].capitalize()} "
-    return Doc(
-        task_name=task_name,
-        query=preprocess(line["activity_label"] + ": " + ctx),
-        choices=[" " + preprocess(ending) for ending in line["endings"]],
-        gold_index=int(line["label"]) if line["label"] != "" else -1,  # -1 for test
-        # "metric": "choices_loglikelihood",
-    )
-# 0-shot for common sense
-COMMON_SENSE_REASONING_STRING = [(t, f"custom|{t.name}|0|1") for t in COMMON_SENSE_REASONING_TASKS]
-_TASKS_STRINGS.extend(COMMON_SENSE_REASONING_STRING)
-_TASKS += COMMON_SENSE_REASONING_TASKS
-## WORLD_KNOWLEDGE_TASKS ##
-WORLD_KNOWLEDGE_TASKS = [
-    CustomEvaluationTask(
-        name="trivia_qa",
-        prompt_function="triviaqa",
-        hf_repo="trivia_qa",
-        hf_subset="rc.nocontext",
-        metric=[Metrics.quasi_exact_match2],
-        generation_size=20,
-        stop_sequence=["\n", ".", ","],
-    ),
-    CustomEvaluationTask(
-        name="natural_questions",
-        prompt_function="natural_questions_prompt",
-        hf_repo="lighteval/natural_questions_clean",
-        hf_subset="default",
-        metric=[Metrics.quasi_exact_match2],
-        generation_size=20,
-        stop_sequence=["\n", ".", ","],
-    ),
-]
-def natural_questions_prompt(line, task_name: str = None):
-    return Doc(
-        task_name=task_name,
-        query=line["question"] + "?\nAnswer: ",
-        choices=[line["short_answers"]],
-        gold_index=0,
-        instruction="",
-    )
-WORLD_KNOWLEDGE_STRING = [(t, f"custom|{t.name}|5|1") for t in WORLD_KNOWLEDGE_TASKS]
-# WORLD_KNOWLEDGE_STRING = {t: f'custom|{t.name}|0|1' for t in WORLD_KNOWLEDGE_TASKS}
-_TASKS_STRINGS.extend(WORLD_KNOWLEDGE_STRING)
-_TASKS += WORLD_KNOWLEDGE_TASKS
-## Reading comprehension ##
-READING_COMP_TASKS = [
-    CustomEvaluationTask(
-        name="super_glue:boolq",
-        prompt_function="boolq_prompt",
-        hf_repo="super_glue",
-        hf_subset="boolq",
-        metric=[Metrics.target_perplexity],
-    ),
-    CustomEvaluationTask(
-        name="quac",
-        prompt_function="quac",
-        hf_repo="lighteval/quac_helm",
-        hf_subset="default",
-        metric=[Metrics.quasi_exact_match2],
-        generation_size=20,
-        stop_sequence=["\n", ".", ","],
-    ),
-]
-def boolq_prompt(line, task_name: str = None):
-    return Doc(
-        task_name=task_name,
-        query=f"{line['passage']}\nQuestion: {line['question'].capitalize()}?\nAnswer:",
-        choices=[" No", " Yes"],  # Only gold
-        gold_index=int(line["label"]),
-    )
-READING_COMP_STRING = [(t, f"custom|{t.name}|0|1") for t in READING_COMP_TASKS]
-_TASKS_STRINGS.extend(READING_COMP_STRING)
-_TASKS += READING_COMP_TASKS
-## MATH ##
-class CustomMathEvaluationTask(CustomEvaluationTask):
-    """Custom class for math tasks with all the defaults set"""
-    def __init__(
-        self,
-        name,
-        prompt_function="math",
-        hf_repo="lighteval/MATH",
-        hf_subset=None,
-        metric=[Metrics.math_quasi_exact_match],
-        hf_avail_splits=None,
-        evaluation_splits=["test"],
-        few_shots_split=None,
-        few_shots_select=None,
-        suite=["custom"],
-        generation_size=40,
-        stop_sequence=None,
-        output_regex=None,
-        frozen=False,
-    ):
-        super().__init__(
-            name=name,
-            prompt_function=prompt_function,
-            hf_repo=hf_repo,
-            hf_subset=hf_subset,
-            metric=metric,
-            hf_avail_splits=hf_avail_splits,
-            evaluation_splits=evaluation_splits,
-            few_shots_split=few_shots_split,
-            few_shots_select=few_shots_select,
-            suite=suite,
-            generation_size=generation_size,
-            stop_sequence=stop_sequence,
-            output_regex=output_regex,
-            frozen=frozen,
-        )
-MATH_TASKS = [
-    CustomMathEvaluationTask(name="math:algebra", hf_subset="algebra"),
-    CustomMathEvaluationTask(name="math:counting_and_probability", hf_subset="counting_and_probability"),
-    CustomMathEvaluationTask(name="math:geometry", hf_subset="geometry"),
-    CustomMathEvaluationTask(name="math:intermediate_algebra", hf_subset="intermediate_algebra"),
-    CustomMathEvaluationTask(name="math:number_theory", hf_subset="number_theory"),
-    CustomMathEvaluationTask(name="math:prealgebra", hf_subset="prealgebra"),
-    CustomMathEvaluationTask(name="math:precalculus", hf_subset="precalculus"),
-]
-GSM8K = CustomEvaluationTask(
-    name="gsm8k",
-    prompt_function="gsm8k",
-    hf_repo="gsm8k",
-    hf_subset="main",
-    hf_avail_splits=["train", "test"],
-    evaluation_splits=["test"],
-    metric=[Metrics.perfect_exact_match],
-    generation_size=10,
-    stop_sequence=["\n"],
-)
-MATH_STRING = [(t, f"custom|{t.name}|4|1") for t in MATH_TASKS]
-GSM8K_STRING = [(GSM8K, f"custom|{GSM8K.name}|8|1")]
-_TASKS_STRINGS.extend(MATH_STRING)
-_TASKS_STRINGS.extend(GSM8K_STRING)
-_TASKS += MATH_TASKS + [GSM8K]
-## MMLU ##
-class CustomMMLUEvaluationTask(CustomEvaluationTask):
-    def __init__(
-        self,
-        name,
-        prompt_function="mmlu_prompt",
-        hf_repo="lighteval/mmlu",
-        hf_subset=None,
-        #  metric=[Metrics.loglikelihood_acc_single_token],
-        metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
-        hf_avail_splits=None,
-        evaluation_splits=["test"],
-        few_shots_split="dev",
-        few_shots_select=None,
-        suite=None,
-        generation_size=-1,
-        stop_sequence=None,
-        output_regex=None,
-        frozen=False,
-    ):
-        super().__init__(
-            name=name,
-            prompt_function=prompt_function,
-            hf_repo=hf_repo,
-            hf_subset=hf_subset,
-            metric=metric,
-            hf_avail_splits=hf_avail_splits,
-            evaluation_splits=evaluation_splits,
-            few_shots_split=few_shots_split,
-            few_shots_select=few_shots_select,
-            suite=suite,
-            generation_size=generation_size,
-            stop_sequence=stop_sequence,
-            output_regex=output_regex,
-            frozen=frozen,
-        )
-MMLU_TASKS = [
-    CustomMMLUEvaluationTask(name="mmlu:abstract_algebra", hf_subset="abstract_algebra"),
-    CustomMMLUEvaluationTask(name="mmlu:anatomy", hf_subset="anatomy"),
-    CustomMMLUEvaluationTask(name="mmlu:astronomy", hf_subset="astronomy"),
-    CustomMMLUEvaluationTask(name="mmlu:business_ethics", hf_subset="business_ethics"),
-    CustomMMLUEvaluationTask(name="mmlu:clinical_knowledge", hf_subset="clinical_knowledge"),
-    CustomMMLUEvaluationTask(name="mmlu:college_biology", hf_subset="college_biology"),
-    CustomMMLUEvaluationTask(name="mmlu:college_chemistry", hf_subset="college_chemistry"),
-    CustomMMLUEvaluationTask(name="mmlu:college_computer_science", hf_subset="college_computer_science"),
-    CustomMMLUEvaluationTask(name="mmlu:college_mathematics", hf_subset="college_mathematics"),
-    CustomMMLUEvaluationTask(name="mmlu:college_medicine", hf_subset="college_medicine"),
-    CustomMMLUEvaluationTask(name="mmlu:college_physics", hf_subset="college_physics"),
-    CustomMMLUEvaluationTask(name="mmlu:computer_security", hf_subset="computer_security"),
-    CustomMMLUEvaluationTask(name="mmlu:conceptual_physics", hf_subset="conceptual_physics"),
-    CustomMMLUEvaluationTask(name="mmlu:econometrics", hf_subset="econometrics"),
-    CustomMMLUEvaluationTask(name="mmlu:electrical_engineering", hf_subset="electrical_engineering"),
-    CustomMMLUEvaluationTask(name="mmlu:elementary_mathematics", hf_subset="elementary_mathematics"),
-    CustomMMLUEvaluationTask(name="mmlu:formal_logic", hf_subset="formal_logic"),
-    CustomMMLUEvaluationTask(name="mmlu:global_facts", hf_subset="global_facts"),
-    CustomMMLUEvaluationTask(name="mmlu:high_school_biology", hf_subset="high_school_biology"),
-    CustomMMLUEvaluationTask(name="mmlu:high_school_chemistry", hf_subset="high_school_chemistry"),
-    CustomMMLUEvaluationTask(name="mmlu:high_school_computer_science", hf_subset="high_school_computer_science"),
-    CustomMMLUEvaluationTask(name="mmlu:high_school_european_history", hf_subset="high_school_european_history"),
-    CustomMMLUEvaluationTask(name="mmlu:high_school_geography", hf_subset="high_school_geography"),
-    CustomMMLUEvaluationTask(
-        name="mmlu:high_school_government_and_politics", hf_subset="high_school_government_and_politics"
-    ),
-    CustomMMLUEvaluationTask(name="mmlu:high_school_macroeconomics", hf_subset="high_school_macroeconomics"),
-    CustomMMLUEvaluationTask(name="mmlu:high_school_mathematics", hf_subset="high_school_mathematics"),
-    CustomMMLUEvaluationTask(name="mmlu:high_school_microeconomics", hf_subset="high_school_microeconomics"),
-    CustomMMLUEvaluationTask(name="mmlu:high_school_physics", hf_subset="high_school_physics"),
-    CustomMMLUEvaluationTask(name="mmlu:high_school_psychology", hf_subset="high_school_psychology"),
-    CustomMMLUEvaluationTask(name="mmlu:high_school_statistics", hf_subset="high_school_statistics"),
-    CustomMMLUEvaluationTask(name="mmlu:high_school_us_history", hf_subset="high_school_us_history"),
-    CustomMMLUEvaluationTask(name="mmlu:high_school_world_history", hf_subset="high_school_world_history"),
-    CustomMMLUEvaluationTask(name="mmlu:human_aging", hf_subset="human_aging"),
-    CustomMMLUEvaluationTask(name="mmlu:human_sexuality", hf_subset="human_sexuality"),
-    CustomMMLUEvaluationTask(name="mmlu:international_law", hf_subset="international_law"),
-    CustomMMLUEvaluationTask(name="mmlu:jurisprudence", hf_subset="jurisprudence"),
-    CustomMMLUEvaluationTask(name="mmlu:logical_fallacies", hf_subset="logical_fallacies"),
-    CustomMMLUEvaluationTask(name="mmlu:machine_learning", hf_subset="machine_learning"),
-    CustomMMLUEvaluationTask(name="mmlu:management", hf_subset="management"),
-    CustomMMLUEvaluationTask(name="mmlu:marketing", hf_subset="marketing"),
-    CustomMMLUEvaluationTask(name="mmlu:medical_genetics", hf_subset="medical_genetics"),
-    CustomMMLUEvaluationTask(name="mmlu:miscellaneous", hf_subset="miscellaneous"),
-    CustomMMLUEvaluationTask(name="mmlu:moral_disputes", hf_subset="moral_disputes"),
-    CustomMMLUEvaluationTask(name="mmlu:moral_scenarios", hf_subset="moral_scenarios"),
-    CustomMMLUEvaluationTask(name="mmlu:nutrition", hf_subset="nutrition"),
-    CustomMMLUEvaluationTask(name="mmlu:philosophy", hf_subset="philosophy"),
-    CustomMMLUEvaluationTask(name="mmlu:prehistory", hf_subset="prehistory"),
-    CustomMMLUEvaluationTask(name="mmlu:professional_accounting", hf_subset="professional_accounting"),
-    CustomMMLUEvaluationTask(name="mmlu:professional_law", hf_subset="professional_law"),
-    CustomMMLUEvaluationTask(name="mmlu:professional_medicine", hf_subset="professional_medicine"),
-    CustomMMLUEvaluationTask(name="mmlu:professional_psychology", hf_subset="professional_psychology"),
-    CustomMMLUEvaluationTask(name="mmlu:public_relations", hf_subset="public_relations"),
-    CustomMMLUEvaluationTask(name="mmlu:security_studies", hf_subset="security_studies"),
-    CustomMMLUEvaluationTask(name="mmlu:sociology", hf_subset="sociology"),
-    CustomMMLUEvaluationTask(name="mmlu:us_foreign_policy", hf_subset="us_foreign_policy"),
-    CustomMMLUEvaluationTask(name="mmlu:virology", hf_subset="virology"),
-    CustomMMLUEvaluationTask(name="mmlu:world_religions", hf_subset="world_religions"),
-]
-def mmlu_harness(line, task_name: str = None):
-    topic = line["subject"]
-    prompt = f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n"
-    prompt += line["question"] + "\n"
-    prompt += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, line["choices"])])
-    prompt += "Answer:"
-    gold_ix = LETTER_INDICES.index(line["answer"]) if isinstance(line["answer"], str) else line["answer"]
-    "__few_shots" in line and line["__few_shots"] is True  # We are adding few shots
-    return Doc(
-        task_name=task_name,
-        query=prompt,
-        choices=[" A", " B", " C", " D"],
-        target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
-        gold_index=gold_ix,
-        instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
-    )
-def mmlu_prompt(line, task_name: str = None):
-    """MMLU prompt without letters"""
-    topic = line["subject"]
-    prompt = f"The following are questions about {topic.replace('_', ' ')}.\nQuestion: "
-    prompt += line["question"] + "\nAnswer:"
-    return Doc(
-        task_name=task_name,
-        query=prompt,
-        choices=[f" {c}" for c in line["choices"]],
-        gold_index=line["answer"],
-        instruction=f"The following are questions about {topic.replace('_', ' ')}.\n",
-    )
-# MMLU_STRING = {t: f'custom|{t.name}|5|1' for t in MMLU_TASKS}
-MMLU_STRING = [(t, f"custom|{t.name}|0|1") for t in MMLU_TASKS]
-_TASKS_STRINGS.extend(MMLU_STRING)
-_TASKS += MMLU_TASKS
-## BBH ##
-class CustomBBHEvaluationTask(CustomEvaluationTask):
-    def __init__(
-        self,
-        name,
-        prompt_function="bbh_prompt",
-        hf_repo="lighteval/big_bench_hard",
-        hf_subset=None,
-        metric=[Metrics.exact_match],
-        hf_avail_splits=["train"],
-        evaluation_splits=["train"],
-        few_shots_split="train",
-        few_shots_select=None,
-        suite=None,
-        generation_size=4,
-        stop_sequence=None,
-        output_regex=None,
-        frozen=False,
-    ):
-        super().__init__(
-            name=name,
-            prompt_function=prompt_function,
-            hf_repo=hf_repo,
-            hf_subset=hf_subset,
-            metric=metric,
-            hf_avail_splits=hf_avail_splits,
-            evaluation_splits=evaluation_splits,
-            few_shots_split=few_shots_split,
-            few_shots_select=few_shots_select,
-            suite=suite,
-            generation_size=generation_size,
-            stop_sequence=stop_sequence,
-            output_regex=output_regex,
-            frozen=frozen,
-        )
-BBH_TASKS = [
-    CustomBBHEvaluationTask(name="bbh:boolean_expressions", hf_subset="boolean_expressions"),
-    CustomBBHEvaluationTask(name="bbh:causal_judgement", hf_subset="causal_judgement"),
-    CustomBBHEvaluationTask(name="bbh:date_understanding", hf_subset="date_understanding"),
-    CustomBBHEvaluationTask(name="bbh:disambiguation_qa", hf_subset="disambiguation_qa"),
-    CustomBBHEvaluationTask(name="bbh:dyck_languages", hf_subset="dyck_languages"),
-    CustomBBHEvaluationTask(name="bbh:formal_fallacies", hf_subset="formal_fallacies"),
-    CustomBBHEvaluationTask(name="bbh:geometric_shapes", hf_subset="geometric_shapes"),
-    CustomBBHEvaluationTask(name="bbh:hyperbaton", hf_subset="hyperbaton"),
-    CustomBBHEvaluationTask(name="bbh:logical_deduction_five_objects", hf_subset="logical_deduction_five_objects"),
-    CustomBBHEvaluationTask(name="bbh:logical_deduction_seven_objects", hf_subset="logical_deduction_seven_objects"),
-    CustomBBHEvaluationTask(name="bbh:logical_deduction_three_objects", hf_subset="logical_deduction_three_objects"),
-    CustomBBHEvaluationTask(name="bbh:movie_recommendation", hf_subset="movie_recommendation"),
-    CustomBBHEvaluationTask(name="bbh:multistep_arithmetic_two", hf_subset="multistep_arithmetic_two"),
-    CustomBBHEvaluationTask(name="bbh:navigate", hf_subset="navigate"),
-    CustomBBHEvaluationTask(name="bbh:object_counting", hf_subset="object_counting"),
-    CustomBBHEvaluationTask(name="bbh:penguins_in_a_table", hf_subset="penguins_in_a_table"),
-    CustomBBHEvaluationTask(name="bbh:reasoning_about_colored_objects", hf_subset="reasoning_about_colored_objects"),
-    CustomBBHEvaluationTask(name="bbh:ruin_names", hf_subset="ruin_names"),
-    CustomBBHEvaluationTask(
-        name="bbh:salient_translation_error_detection", hf_subset="salient_translation_error_detection"
-    ),
-    CustomBBHEvaluationTask(name="bbh:snarks", hf_subset="snarks"),
-    CustomBBHEvaluationTask(name="bbh:sports_understanding", hf_subset="sports_understanding"),
-    CustomBBHEvaluationTask(name="bbh:temporal_sequences", hf_subset="temporal_sequences"),
-    CustomBBHEvaluationTask(
-        name="bbh:tracking_shuffled_objects_five_objects", hf_subset="tracking_shuffled_objects_five_objects"
-    ),
-    CustomBBHEvaluationTask(
-        name="bbh:tracking_shuffled_objects_seven_objects", hf_subset="tracking_shuffled_objects_seven_objects"
-    ),
-    CustomBBHEvaluationTask(
-        name="bbh:tracking_shuffled_objects_three_objects", hf_subset="tracking_shuffled_objects_three_objects"
-    ),
-    CustomBBHEvaluationTask(name="bbh:web_of_lies", hf_subset="web_of_lies"),
-    CustomBBHEvaluationTask(name="bbh:word_sorting", hf_subset="word_sorting"),
-]
-def bbh_prompt(line, task_name: str = None):
-    return Doc(
-        task_name=task_name,
-        query=line["input"] + "\nAnswer: ",
-        choices=[line["target"]],
-        gold_index=0,
-    )
-# BBH_STRING = {t: f'custom|{t.name}|3|1' for t in BBH_TASKS}
-BBH_STRING = [(t, f"custom|{t.name}|0|1") for t in BBH_TASKS]
-_TASKS_STRINGS.extend(BBH_STRING)
-_TASKS += BBH_TASKS
-## AGI eval ##
-class CustomAGIEvalEvaluationTask(CustomEvaluationTask):
-    def __init__(
-        self,
-        name,
-        prompt_function="agi_eval_prompt_no_letters",
-        hf_repo="lighteval/agi_eval_en",
-        hf_subset=None,
-        #  metric=[Metrics.loglikelihood_acc_single_token],
-        metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
-        hf_avail_splits=["train", "validation"],
-        evaluation_splits=["train"],
-        few_shots_split="validation",
-        few_shots_select=None,
-        suite=None,
-        generation_size=-1,
-        stop_sequence=None,
-        output_regex=None,
-        frozen=False,
-    ):
-        super().__init__(
-            name=name,
-            prompt_function=prompt_function,
-            hf_repo=hf_repo,
-            hf_subset=hf_subset,
-            metric=metric,
-            hf_avail_splits=hf_avail_splits,
-            evaluation_splits=evaluation_splits,
-            few_shots_split=few_shots_split,
-            few_shots_select=few_shots_select,
-            suite=suite,
-            generation_size=generation_size,
-            stop_sequence=stop_sequence,
-            output_regex=output_regex,
-            frozen=frozen,
-        )
-AGIEVAL_TASKS = [
-    CustomAGIEvalEvaluationTask(name="agi_eval:aqua_rat", hf_subset="aqua_rat"),
-    CustomAGIEvalEvaluationTask(name="agi_eval:logiqa-en", hf_subset="logiqa-en"),
-    CustomAGIEvalEvaluationTask(name="agi_eval:lsat-ar", hf_subset="lsat-ar"),
-    CustomAGIEvalEvaluationTask(name="agi_eval:lsat-lr", hf_subset="lsat-lr"),
-    CustomAGIEvalEvaluationTask(name="agi_eval:lsat-rc", hf_subset="lsat-rc"),
-    CustomAGIEvalEvaluationTask(
-        name="agi_eval:math",
-        hf_subset="math",
-        prompt_function="agi_eval_math_prompt",
-        metric=[Metrics.exact_match, Metrics.quasi_exact_match2],
-        generation_size=40,
-    ),
-    CustomAGIEvalEvaluationTask(name="agi_eval:sat-en", hf_subset="sat-en"),
-    CustomAGIEvalEvaluationTask(name="agi_eval:sat-math", hf_subset="sat-math"),
-]
-def agi_eval_math_prompt(line, task_name: str = None):
-    return Doc(
-        task_name=task_name,
-        query=line["question"],
-        choices=[line["answer"]],
-        gold_index=0,
-        instruction="",
-    )
-def agi_eval_prompt(line, task_name: str = None):
-    cleaned_options = [o.replace("(", "").replace(")", " ") for o in line["options"]]
-    prompt = "The following are multiple choice questions (with answers).\n\n"
-    prompt += line["question"] + "\n" + "\n".join(cleaned_options) + "\n"
-    prompt += "Answer: "
-    choices = LETTER_INDICES[: len(line["options"])]
-    output = Doc(
-        query=prompt,
-        instruction="The following are multiple choice questions (with answers).\n\n",
-    )
-    if line["label"]:
-        output.choices = choices
-        output.gold_index = LETTER_INDICES.index(line["label"].strip())
-    else:
-        output.choices = [line["answer"]]
-        output.gold_index = 0
-    return output
-def agi_eval_prompt_no_letters(line, task_name: str = None):
-    cleaned_options = [
-        " " + o.replace("(A)", "").replace("(B)", "").replace("(C)", "").replace("(D)", "").replace("(E)", "")
-        for o in line["options"]
-    ]
-    output = Doc(
-        query=line["question"],
-        choices=cleaned_options,
-        gold_index=LETTER_INDICES.index(line["label"].strip()),
-        instruction="",
-    )
-    return output
-# AGIEVAL_STRING = {t: f'custom|{t.name}|5|1' for t in AGIEVAL_TASKS}
-AGIEVAL_STRING = [(t, f"custom|{t.name}|0|1") for t in AGIEVAL_TASKS]
-_TASKS_STRINGS.extend(AGIEVAL_STRING)
-_TASKS += AGIEVAL_TASKS
-## HUMAN EVAL ##
-# human_eval = CustomEvaluationTask(
-#         name="human_eval",
-#         prompt_function="human_eval",
-#         hf_repo="lighteval/human_eval",
-#         metric=["human_eval_pass_at_1"],
-#     ),
-def has_generative_metrics(task: CustomEvaluationTask) -> bool:
-    for metric in task.metric:
-        if metric in NEEDS_GENERATION_ONLY:
-            return True
-    return False
-EARLY_SIGNAL_TASKS = ",".join([t[1] for t in COMMON_SENSE_REASONING_STRING] + [t[1] for t in MMLU_STRING])
-# Convert to dict for lighteval
-TASKS_TABLE = [asdict(task) for task in _TASKS]
-# You can have a few pre-organised groups of tasks
-TASKS_GROUPS = {
-    "all": ",".join(t[1] for t in _TASKS_STRINGS),
-    "early-signal": EARLY_SIGNAL_TASKS,
-    "non-generatives": ",".join(t for k, t in _TASKS_STRINGS if not has_generative_metrics(k)),
-    "generatives": ",".join(t for k, t in _TASKS_STRINGS if has_generative_metrics(k)),
-}
-if __name__ == "__main__":
-    print(t["name"] for t in TASKS_TABLE)
-    print(len(TASKS_TABLE))

custom_evaluation_utils.py DELETED Viewed

@@ -1,158 +0,0 @@
-"""
-Custom evaluation tasks for lighteval
-"""
-from dataclasses import dataclass
-from enum import Enum, auto
-from typing import Optional, Tuple, Union
-class Metrics(Enum):
-    any_target_loglikelihood_acc = auto()
-    bert_score = auto()
-    bias = auto()
-    bits_per_byte = auto()
-    bleu = auto()
-    bleu_1 = auto()
-    bleu_4 = auto()
-    byte_perplexity = auto()
-    chrf = auto()
-    code_eval_APPS = auto()
-    code_eval_HE = auto()
-    copyright = auto()
-    disinformation = auto()
-    exact_match = auto()
-    exact_set_match = auto()
-    extractiveness = auto()
-    f1_from_bags = auto()
-    f1_quasi = auto()
-    f1_sequence = auto()
-    f1_set_match = auto()
-    faithfulness = auto()
-    iou_set_match = auto()
-    log_prob = auto()
-    loglikelihood_acc = auto()
-    loglikelihood_acc_norm = auto()
-    loglikelihood_acc_norm_nospace = auto()
-    loglikelihood_acc_norm_single_token = auto()
-    loglikelihood_acc_single_token = auto()
-    loglikelihood_f1 = auto()
-    loglikelihood_f1_single_token = auto()
-    math_quasi_exact_match = auto()
-    mc_taco = auto()
-    mcc = auto()
-    mcc_single_token = auto()
-    mrr = auto()
-    mrr_single_token = auto()
-    multi_fi_numeric = auto()
-    one_choice_loglikelihood_acc = auto()
-    perfect_exact_match = auto()
-    prediction_perplexity = auto()
-    prefix_exact_match = auto()
-    prefix_quasi_exact_match = auto()
-    quasi_exact_match = auto()
-    ranking = auto()
-    recall_at_1_single_token = auto()
-    recall_at_2_single_token = auto()
-    recall_at_1 = auto()
-    recall_at_2 = auto()
-    rouge = auto()
-    rouge_1 = auto()
-    rouge_2 = auto()
-    rouge_l = auto()
-    target_perplexity = auto()
-    ter = auto()
-    toxicity = auto()
-    truthfulqa_mc_metrics = auto()
-    word_perplexity = auto()
-    def __str__(self):
-        return self.name.replace("_at_", "@")
-NEEDS_GENERATION_ONLY = [
-    "perfect_exact_match",
-    "exact_match",
-    "quasi_exact_match",
-    "quasi_exact_match2",
-    "prefix_exact_match",
-    "prefix_quasi_exact_match",
-    "math_quasi_exact_match",
-    "iou_set_match",
-    "exact_set_match",
-    "f1_sequence",
-    "f1_quasi",
-    "f1_set_match",
-    "f1_from_bags",
-    "chrf",
-    "ter",
-    "rouge",
-    "rouge_1",
-    "rouge_2",
-    "rouge_l",
-    "faithfulness",
-    "extractiveness",
-    "bert_score",
-    "bleu",
-    "bleu_1",
-    "bleu_4",
-    "bias",
-    "toxicity",
-    "code_eval_HE",
-    "code_eval_APPS",
-    "copyright",
-]
-@dataclass(unsafe_hash=True)
-class CustomEvaluationTask:
-    name: str
-    prompt_function: str
-    hf_repo: str
-    hf_subset: str
-    metric: Tuple[Union[str, Metrics]]
-    hf_avail_splits: Optional[Tuple[str]] = None
-    evaluation_splits: Optional[Tuple[str]] = None
-    few_shots_split: Optional[str] = None
-    few_shots_select: Optional[str] = None
-    generation_size: int = -1
-    stop_sequence: Optional[Tuple[str]] = None
-    output_regex: Optional[str] = None
-    frozen: bool = False
-    suite: Optional[Tuple[str]] = None  # we use this to know if we should use a custom lighteval or bigcode task
-    def __post_init__(self):
-        self.metric = [str(m) for m in self.metric]
-        if self.suite is None:
-            self.suite = ["custom"]
-        if self.hf_avail_splits is None:
-            self.hf_avail_splits = ["train", "validation", "test"]
-        if self.evaluation_splits is None:
-            self.evaluation_splits = ["validation"]
-        if self.stop_sequence is None:
-            self.stop_sequence = ["\n"]
-        # Convert list to tuple for hashing
-        self.metric = tuple(self.metric)
-        self.hf_avail_splits = tuple(self.hf_avail_splits) if self.hf_avail_splits else None
-        self.evaluation_splits = tuple(self.evaluation_splits) if self.evaluation_splits else None
-        self.suite = tuple(self.suite) if self.suite else None
-        self.stop_sequence = tuple(self.stop_sequence) if self.stop_sequence else None
-@dataclass(unsafe_hash=True)
-class BigCodeEvaluationTask:
-    name: str
-    bigcode_task: str
-    bigcode_task_kwargs: Optional[dict] = None
-    n_samples: int = 1
-    prefix: Optional[str] = None
-    suite: Tuple[str] = None
-    def __post_init__(self):
-        if self.suite is None:
-            self.suite = ("bigcode",)
-        # Convert list to tuple for hashing
-        self.suite = tuple(self.suite)

lighteval_eval_config.yaml CHANGED Viewed

@@ -1,10 +1,5 @@
-checkpoints: null
-data: null
-experiment_logger: null
-general: null
-kill_switch_path: null
 lighteval:
-  batch_size: 24
   checkpoints_path: null
   generation: null
   logging:
@@ -17,29 +12,20 @@ lighteval:
     push_results_to_tensorboard: true
     tensorboard_metric_prefix: e
   parallelism:
-    dp: 4
     pp: 1
     pp_engine: 1f1b
     recompute_granularity: null
-    tp: 2
     tp_linear_async_communication: false
     tp_mode: ALL_REDUCE
-  slurm: null
   slurm_script_dir: null
   slurm_template: null
   tasks:
-    custom_tasks_file: ./custom_evaluation_tasks.py
     dataset_loading_processes: 8
-    max_samples: 1000
     multichoice_continuations_start_space: null
     no_multichoice_continuations_start_space: null
     num_fewshot_seeds: null
-    tasks: early-signal
-logging: null
-model: null
-optimizer: null
-parallelism: null
-profiler: null
-s3_upload: null
-tokenizer: null
-tokens: null

 lighteval:
+  batch_size: 4
   checkpoints_path: null
   generation: null
   logging:
     push_results_to_tensorboard: true
     tensorboard_metric_prefix: e
   parallelism:
+    dp: 8
     pp: 1
     pp_engine: 1f1b
     recompute_granularity: null
+    tp: 1
     tp_linear_async_communication: false
     tp_mode: ALL_REDUCE
   slurm_script_dir: null
   slurm_template: null
   tasks:
+    custom_tasks: brrr.lighteval.custom_tasks
     dataset_loading_processes: 8
+    max_samples: 10000
     multichoice_continuations_start_space: null
     no_multichoice_continuations_start_space: null
     num_fewshot_seeds: null
+    tasks: open-llm-leaderboard

modeling_mistral.py CHANGED Viewed

@@ -106,7 +106,7 @@ class RotaryEmbedding(nn.Module):
             self.end *= 2
             self._initialized_buffer = False
         if self._initialized_buffer is False:
-            print(f"Initializing rotary embeddings with end={self.end}")
             self.init_rotary_embeddings()
         dtype = x.dtype
         assert inner_dim % 2 == 0
@@ -397,7 +397,6 @@ class CausalSelfAttention(nn.Module, AttachableStore):
             # Double check that we use store only at inference time
             assert key_states.requires_grad is False
             assert value_states.requires_grad is False
-            print("Using store")
             if "position_offsets" in store:
                 old_position_offsets = store["position_offsets"]
                 position_ids = old_position_offsets[:, None] + sequence_mask

             self.end *= 2
             self._initialized_buffer = False
         if self._initialized_buffer is False:
+            # print(f"Initializing rotary embeddings with end={self.end}")
             self.init_rotary_embeddings()
         dtype = x.dtype
         assert inner_dim % 2 == 0
             # Double check that we use store only at inference time
             assert key_states.requires_grad is False
             assert value_states.requires_grad is False
             if "position_offsets" in store:
                 old_position_offsets = store["position_offsets"]
                 position_ids = old_position_offsets[:, None] + sequence_mask

pretrained/Mistral-7B-v0.1/config.yaml CHANGED Viewed

@@ -7,7 +7,7 @@ general:
   project: mistralai
   run: Mistral-7B-v0.1
   seed: 42
-  step: null
 logging: null
 model:
   ddp_bucket_cap_mb: 25

   project: mistralai
   run: Mistral-7B-v0.1
   seed: 42
+  step: 0
 logging: null
 model:
   ddp_bucket_cap_mb: 25

run_evals.py CHANGED Viewed

@@ -10,46 +10,12 @@ torchrun --nproc_per_node=8 run_evals.py --checkpoint-config-path ./pretrained/M
 """
 # flake8: noqa: C901
 import argparse
-import os
-import random
-import time
-from dataclasses import asdict
-from pathlib import Path
-import numpy as np
-import torch
-from huggingface_hub import HFSummaryWriter
-from lighteval.evaluator import evaluate, make_results_table
-from lighteval.logging.evaluation_tracker import EvaluationTracker
-from lighteval.logging.hierarchical_logger import hlog, htrack, htrack_block
-from lighteval.logging.info_loggers import (
-    DetailsLogger,
-)
-from lighteval.models.model_loader import ModelInfo
-from lighteval.tasks.lighteval_task import LightevalTask, create_requests_from_tasks
-from lighteval.tasks.registry import Registry, get_custom_tasks, taskinfo_selector
-from nanotron import distributed as dist
-from nanotron import logging
-from nanotron.config import get_config_from_file
-from nanotron.logging import get_logger, log_rank
-from nanotron.parallel.context import ParallelContext
-from nanotron.utils import local_ranks_zero_first
-from brrr.config import BrrrConfig
-from brrr.experiment_loggers import flatten_dict, obj_to_markdown
-from brrr.s3_checkpoints import fs_copy
-from brrr.utils import check_env
-from lighteval.models.brrr_models import BRRRModel
 from modeling_mistral import MistralForTraining
 from config_mistral import MistralConfig
-logger = get_logger(__name__)
-TOKEN = os.getenv("HF_TOKEN")
-CACHE_DIR = os.getenv("HF_HOME", "/scratch")
 def get_parser():
     parser = argparse.ArgumentParser()
@@ -69,374 +35,25 @@ def get_parser():
         type=str,
         help="Local or hub path of an optional tokenizer (if not indicated in the checkpoint)",
     )
-    parser.add_argument(
-        "--s5cmd-path",
-        type=str,
-        default="/admin/home/thomwolf/miniconda3/envs/b4r/bin/s5cmd",
-        help="Path to s5cmd install",
-    )
-    parser.add_argument(
-        "--s5cmd-numworkers",
-        type=int,
-        default=64,
-        help="s5cmd num workers (optional)",
-    )
-    parser.add_argument(
-        "--s5cmd-concurrency",
-        type=int,
-        default=10,
-        help="s5cmd concurrency (optional)",
-    )
     parser.add_argument(
         "--cache-dir",
         type=str,
-        default="",
         help="Cache directory",
     )
     return parser
-def push_results_to_wandb(  # noqa: C901
-    config: BrrrConfig, results: dict[str, dict[str, float]], details: dict[str, DetailsLogger.CompiledDetail]
-):
-    # config: BrrrConfig = get_config_from_dict(config, config_class=BrrrConfig)
-    lighteval_config = config.lighteval
-    try:
-        global_step = config.general.step
-    except ValueError:
-        global_step = 0
-    if config.lighteval.logging.tensorboard_metric_prefix is not None:
-        prefix = config.lighteval.logging.tensorboard_metric_prefix
-    else:
-        prefix = "eval"
-    output_dir_tb = Path(lighteval_config.logging.local_output_path) / "tb" / (config.general.run + "_" + prefix)
-    output_dir_tb.mkdir(parents=True, exist_ok=True)
-    os.environ["WANDB_DISABLE_SERVICE"] = "True"
-    import wandb
-    wandb.tensorboard.patch(root_logdir=config.lighteval.logging.local_output_path)
-    hlog("Starting wandb with WANDB_DISABLE_SERVICE=True")
-    wandb.init(
-        project=config.lighteval.wandb.wandb_project,
-        entity=config.lighteval.wandb.wandb_entity,
-        name=config.lighteval.wandb.wandb_run_name,
-        config=config.as_dict(),
-        # sync_tensorboard=True,
-        resume=True,
-    )
-    wb_dict = {}
-    bench_averages = {}
-    for name, values in results.items():
-        splited_name = name.split("|")
-        if len(splited_name) == 3:
-            _, task_name, _ = splited_name
-        else:
-            task_name = name
-        bench_suite = None
-        if ":" in task_name:
-            bench_suite = task_name.split(":")[0]  # e.g. MMLU
-            hlog(f"bench_suite {bench_suite} in {task_name}")
-            for metric, value in values.items():
-                if "stderr" in metric:
-                    continue
-                if bench_suite not in bench_averages:
-                    bench_averages[bench_suite] = {}
-                bench_averages[bench_suite][metric] = bench_averages[bench_suite].get(metric, []) + [float(value)]
-        hlog(f"Pushing {task_name} {values} to tensorboard")
-        for metric, value in values.items():
-            if "stderr" in metric:
-                wb_dict[f"stderr_{metric}/{task_name}"] = value
-            elif bench_suite is not None:
-                wb_dict[f"{bench_suite}-{metric}/{task_name}"] = value
-            else:
-                wb_dict[f"{metric}/{task_name}"] = value
-    # e.g. MMLU
-    for name, values in bench_averages.items():
-        for metric, values in values.items():
-            hlog(f"Pushing average {name} {metric} {sum(values) / len(values)} to tensorboard")
-            wb_dict[f"{metric}/{name}"] = sum(values) / len(values)
-    for task_name, task_details in details.items():
-        if len(task_details) <= 1:
-            continue
-        columns = list(flatten_dict(asdict(task_details[0])).keys())
-        table = wandb.Table(columns=columns)
-        table.add_data(*[str(v) for v in flatten_dict(asdict(task_details[0])).values()])
-        table.add_data(*[str(v) for v in flatten_dict(asdict(task_details[1])).values()])
-        wandb.log({f"eval_details_{task_name}": table}, step=global_step, commit=False)
-    wandb.log(dict(wb_dict.items()), step=global_step, commit=True)
-    # tb_context.add_text("eval_sizes", obj_to_markdown(sizes), global_step=global_step)
-    # We are doing parallel evaluations of multiple checkpoints and recording the steps not in order
-    # This messes up with tensorboard, so the easiest is to rename files in the order of the checkpoints
-    # See: https://github.com/tensorflow/tensorboard/issues/5958
-    # But tensorboardX don't let us control the prefix of the files (only the suffix), so we need to do it ourselves before commiting the files
-    hlog(f"Pushed to wandb" f" at {output_dir_tb} and global_step {global_step}")
-def push_results_to_tensorboard(  # noqa: C901
-    config: BrrrConfig, results: dict[str, dict[str, float]], details: dict[str, DetailsLogger.CompiledDetail]
-):
-    # config: BrrrConfig = get_config_from_dict(config, config_class=BrrrConfig)
-    lighteval_config = config.lighteval
-    try:
-        global_step = config.general.step
-    except ValueError:
-        global_step = 0
-    if config.lighteval.logging.tensorboard_metric_prefix is not None:
-        prefix = config.lighteval.logging.tensorboard_metric_prefix
-    else:
-        prefix = "eval"
-    output_dir_tb = Path(lighteval_config.logging.local_output_path) / "tb" / (config.general.run + "_" + prefix)
-    output_dir_tb.mkdir(parents=True, exist_ok=True)
-    tb_context = HFSummaryWriter(
-        logdir=str(output_dir_tb),
-        repo_id=lighteval_config.logging.hub_repo_tensorboard,
-        repo_private=True,
-        path_in_repo="tb",
-        commit_every=6000,  # Very long time so that we can change our files names and trigger push ourselves (see below)
-    )
-    bench_averages = {}
-    for name, values in results.items():
-        splited_name = name.split("|")
-        if len(splited_name) == 3:
-            _, task_name, _ = splited_name
-        else:
-            task_name = name
-        bench_suite = None
-        if ":" in task_name:
-            bench_suite = task_name.split(":")[0]  # e.g. MMLU
-            hlog(f"bench_suite {bench_suite} in {task_name}")
-            for metric, value in values.items():
-                if "stderr" in metric:
-                    continue
-                if bench_suite not in bench_averages:
-                    bench_averages[bench_suite] = {}
-                bench_averages[bench_suite][metric] = bench_averages[bench_suite].get(metric, []) + [float(value)]
-        hlog(f"Pushing {task_name} {values} to tensorboard")
-        for metric, value in values.items():
-            if "stderr" in metric:
-                tb_context.add_scalar(f"stderr_{prefix}/{task_name}/{metric}", value, global_step=global_step)
-            elif bench_suite is not None:
-                tb_context.add_scalar(f"{prefix}_{bench_suite}/{task_name}/{metric}", value, global_step=global_step)
-            else:
-                tb_context.add_scalar(f"{prefix}/{task_name}/{metric}", value, global_step=global_step)
-    # e.g. MMLU
-    for name, values in bench_averages.items():
-        for metric, values in values.items():
-            hlog(f"Pushing average {name} {metric} {sum(values) / len(values)} to tensorboard")
-            tb_context.add_scalar(f"{prefix}/{name}/{metric}", sum(values) / len(values), global_step=global_step)
-    tb_context.add_text("eval_config", obj_to_markdown(results), global_step=global_step)
-    # tb_context.add_text("eval_sizes", obj_to_markdown(sizes), global_step=global_step)
-    for task_name, task_details in details.items():
-        tb_context.add_text(
-            f"eval_details_{task_name}",
-            obj_to_markdown({"0": task_details[0], "1": task_details[1] if len(task_details) > 1 else {}}),
-            global_step=global_step,
-        )
-    # We are doing parallel evaluations of multiple checkpoints and recording the steps not in order
-    # This messes up with tensorboard, so the easiest is to rename files in the order of the checkpoints
-    # See: https://github.com/tensorflow/tensorboard/issues/5958
-    # But tensorboardX don't let us control the prefix of the files (only the suffix), so we need to do it ourselves before commiting the files
-    tb_context.close()  # flushes the unfinished write operations
-    time.sleep(5)
-    files = os.listdir(output_dir_tb)
-    for file in files:
-        os.rename(os.path.join(output_dir_tb, file), os.path.join(output_dir_tb, f"{global_step:07d}_{file}"))
-    # Now we can push to the hub
-    tb_context.scheduler.trigger()
-    hlog(
-        f"Pushed to tensorboard at https://huggingface.co/tensorboard/{lighteval_config.logging.hub_repo_tensorboard}/"
-        f" at {output_dir_tb} and global_step {global_step}"
-    )
-@htrack()
-def main(args):
-    cache_dir = args.cache_dir or CACHE_DIR
-    check_env()
-    dist.initialize_torch_distributed()
-    with htrack_block("get config"):
-        if not args.checkpoint_config_path.endswith(".yaml"):
-            raise ValueError("The checkpoint path should point to a YAML file")
-        local_config_path = args.checkpoint_config_path
-        if args.checkpoint_config_path.startswith("s3:/"):
-            local_config_path = args.checkpoint_config_path.replace("s3:/", cache_dir)
-            with local_ranks_zero_first():
-                if os.environ.get("LOCAL_RANK", None) == "0":
-                    os.makedirs(os.path.dirname(local_config_path), exist_ok=True)
-                    fs_copy(args.checkpoint_config_path, local_config_path)
-        brrr_config: BrrrConfig = get_config_from_file(local_config_path, config_class=BrrrConfig, model_config_class=MistralConfig)
-        if args.lighteval_override:
-            local_override_path = args.lighteval_override.replace("s3:/", cache_dir)
-            if args.lighteval_override.startswith("s3:/"):
-                local_override_path = args.lighteval_override.replace("s3:/", cache_dir)
-                with local_ranks_zero_first():
-                    if os.environ.get("LOCAL_RANK", None) == "0":
-                        os.makedirs(os.path.dirname(local_override_path), exist_ok=True)
-                        fs_copy(args.lighteval_override, local_override_path)
-            lighteval_brrr_config: BrrrConfig = get_config_from_file(local_override_path, config_class=BrrrConfig)
-            lighteval_config = lighteval_brrr_config.lighteval
-            brrr_config.lighteval = lighteval_config
-        else:
-            local_override_path = ""
-            lighteval_config = brrr_config.lighteval
-        parallel_context = ParallelContext(
-            tensor_parallel_size=lighteval_config.parallelism.tp,
-            pipeline_parallel_size=lighteval_config.parallelism.pp,
-            data_parallel_size=lighteval_config.parallelism.dp,
-        )
-        evaluation_tracker = EvaluationTracker(token=TOKEN)
-        evaluation_tracker.general_config_logger.log_args_info(
-            num_fewshot_seeds=1,
-            override_batch_size=None,
-            max_samples=lighteval_config.tasks.max_samples,
-            job_id=os.environ.get("SLURM_JOB_ID", None),
-            config=brrr_config.as_dict(),
-        )
-    with htrack_block("Test all gather"):
-        hlog("Test gather tensor")
-        # Do a first NCCL sync to warmup and try to avoid Timeout after model/data loading
-        log_rank(
-            f"[TEST] Running NCCL sync for ranks {list(range(parallel_context.world_pg.size()))}",
-            logger=logger,
-            level=logging.WARNING,
-            group=parallel_context.dp_pg,
-            rank=0,
-        )
-        test_tensor = torch.tensor([dist.get_rank(parallel_context.world_pg)], device=torch.device("cuda"))
-        test_tensor_list = [torch.zeros_like(test_tensor) for _ in range(parallel_context.world_pg.size())]
-        dist.all_gather(test_tensor_list, test_tensor, group=parallel_context.world_pg, async_op=False)
-        dist.barrier()
-        log_rank(
-            f"[TEST] NCCL sync for ranks {[t.item() for t in test_tensor_list]}",
-            logger=logger,
-            level=logging.WARNING,
-            group=parallel_context.dp_pg,
-            rank=0,
-        )
-        del test_tensor_list
-        del test_tensor
-    with htrack_block("Model loading"):
-        # We need to load the model in the main process first to avoid downloading the model multiple times
-        model = BRRRModel(
-            checkpoint_path=args.checkpoint_config_path.replace("config.yaml", ""),
-            model_args=brrr_config.model,
-            tokenizer=brrr_config.tokenizer,
-            parallel_context=parallel_context,
-            parallel_config=lighteval_config.parallelism,
-            lighteval_config=lighteval_config,
-            batch_size=lighteval_config.batch_size,
-            cache_dir=os.environ.get("HF_HOME", "/scratch"),
-            debug_one_layer_model=False,
-            s5cmd_path=args.s5cmd_path,
-            s5cmd_numworkers=args.s5cmd_numworkers,
-            s5cmd_concurrency=args.s5cmd_concurrency,
-            model_class=MistralForTraining
-        )
-        model_info = ModelInfo(model_name=f"{brrr_config.general.run}/{brrr_config.general.step}")
-        evaluation_tracker.general_config_logger.log_model_info(model_info)
-    with htrack_block("Tasks loading"):
-        with local_ranks_zero_first():
-            tasks_selection = lighteval_config.tasks.tasks
-            if lighteval_config.tasks.custom_tasks_file:
-                _, tasks_groups_dict = get_custom_tasks(lighteval_config.tasks.custom_tasks_file)
-                if tasks_groups_dict and lighteval_config.tasks.tasks in tasks_groups_dict:
-                    tasks_selection = tasks_groups_dict[lighteval_config.tasks.tasks]
-            task_names_list, few_shots_dict = taskinfo_selector(tasks_selection)
-            task_dict = Registry(cache_dir=cache_dir).get_task_dict(
-                task_names_list, custom_tasks_file=lighteval_config.tasks.custom_tasks_file
-            )
-            # Loading all the dataset in a distributed manner
-            LightevalTask.load_datasets(task_dict.values(), lighteval_config.tasks.dataset_loading_processes)
-            evaluation_tracker.task_config_logger.log(task_dict)
-            hlog("Loading documents, and requests")
-            requests, docs = create_requests_from_tasks(
-                task_dict=task_dict,
-                fewshot_dict=few_shots_dict,
-                num_fewshot_seeds=lighteval_config.tasks.num_fewshot_seeds or 1,
-                lm=model,
-                max_samples=lighteval_config.tasks.max_samples,
-                evaluation_tracker=evaluation_tracker,
-                use_chat_template=False
-            )
-    with htrack_block("Setting seeds and waiting for all processes"):
-        hlog(f"setting seed to {1234} for random and numpy")
-        random.seed(1234)
-        np.random.seed(1234)
-        dist.barrier()
-    with htrack_block("Evaluation"):
-        hlog(f"Evaluate on {len(task_names_list)} tasks.")
-        evaluation_tracker = evaluate(
-            lm=model,
-            requests_dict=requests,
-            docs=docs,
-            task_dict=task_dict,
-            override_bs=lighteval_config.batch_size,
-            evaluation_tracker=evaluation_tracker,
-        )
-    if dist.get_rank(parallel_context.world_pg) == 0:
-        with htrack_block("Compiling and saving results"):
-            evaluation_tracker.general_config_logger.log_end_time()
-            evaluation_tracker.metrics_logger.aggregate(task_dict=task_dict, bootstrap_iters=1000)
-            evaluation_tracker.details_logger.aggregate()
-            if lighteval_config.logging.local_output_path:
-                evaluation_tracker.save(
-                    output_dir=lighteval_config.logging.local_output_path,
-                    push_results_to_hub=lighteval_config.logging.push_results_to_hub,
-                    push_details_to_hub=lighteval_config.logging.push_details_to_hub,
-                    public=False,
-                    push_results_to_tensorboard=lighteval_config.logging.push_results_to_tensorboard,
-                )
-            if lighteval_config.logging.push_results_to_tensorboard:
-                push_results_to_tensorboard(
-                    config=brrr_config,
-                    results=evaluation_tracker.metrics_logger.metric_aggregated,
-                    details=evaluation_tracker.details_logger.details,
-                )
-            if lighteval_config.wandb is not None:
-                push_results_to_wandb(
-                    config=brrr_config,
-                    results=evaluation_tracker.metrics_logger.metric_aggregated,
-                    details=evaluation_tracker.details_logger.details,
-                )
-            final_dict = evaluation_tracker.generate_final_dict()
-        hlog(make_results_table(final_dict))
-        return final_dict
 if __name__ == "__main__":
     parser = get_parser()
     args, unknowns = parser.parse_known_args()
-    main(args)

 """
 # flake8: noqa: C901
 import argparse
+from nanotron.config import Config
 from modeling_mistral import MistralForTraining
 from config_mistral import MistralConfig
+from lighteval.main_nanotron import main
 def get_parser():
     parser = argparse.ArgumentParser()
         type=str,
         help="Local or hub path of an optional tokenizer (if not indicated in the checkpoint)",
     )
     parser.add_argument(
         "--cache-dir",
         type=str,
+        default=None,
         help="Cache directory",
     )
     return parser
 if __name__ == "__main__":
     parser = get_parser()
     args, unknowns = parser.parse_known_args()
+    main(
+        args.checkpoint_config_path,
+        args.lighteval_override,
+        args.cache_dir,
+        config_cls=Config,
+        model_config_cls=MistralConfig,
+        model_cls=MistralForTraining
+    )

run_generate.py CHANGED Viewed

@@ -35,9 +35,8 @@ from nanotron.random import (
 from nanotron.serialize import (
     load_weights,
 )
-from nanotron.trainer import CONFIG_TO_MODEL_CLASS, mark_tied_parameters
-from brrr.config import BrrrConfig
 from config_mistral_7b import MistralConfig
 from modeling_mistral import MistralForTraining
@@ -64,7 +63,7 @@ def main():
     assert args.ckpt_path.exists(), f"Checkpoint path {args.ckpt_path} does not exist"
-    config = get_config_from_file((args.ckpt_path / "config.yaml").as_posix(), config_class=BrrrConfig, model_config_class=MistralConfig)
     model_config = config.model.model_config
     tokenizer_path = config.tokenizer.tokenizer_name_or_path

 from nanotron.serialize import (
     load_weights,
 )
+from nanotron.trainer import mark_tied_parameters
 from config_mistral_7b import MistralConfig
 from modeling_mistral import MistralForTraining
     assert args.ckpt_path.exists(), f"Checkpoint path {args.ckpt_path} does not exist"
+    config = get_config_from_file((args.ckpt_path / "config.yaml").as_posix(), model_config_class=MistralConfig, skip_unused_config_keys=True)
     model_config = config.model.model_config
     tokenizer_path = config.tokenizer.tokenizer_name_or_path