{ "results": { "b4b": { "acc,none": 0.7882244143033293, "acc_stderr,none": 0.08841138813945006, "acc_norm,none": 0.7882244143033293, "acc_norm_stderr,none": 0.08841138813945006, "alias": "b4b" }, "b4bqa": { "acc,none": 0.8772321428571429, "acc_stderr,none": 0.007754464516034243, "acc_norm,none": 0.8772321428571429, "acc_norm_stderr,none": 0.007754464516034243, "alias": " - b4bqa" }, "medmcqa_g2b": { "acc,none": 0.617816091954023, "acc_stderr,none": 0.026085614333362674, "acc_norm,none": 0.617816091954023, "acc_norm_stderr,none": 0.026085614333362674, "alias": " - medmcqa_g2b" }, "medmcqa_orig_filtered": { "acc,none": 0.7040229885057471, "acc_stderr,none": 0.024505167376090542, "acc_norm,none": 0.7040229885057471, "acc_norm_stderr,none": 0.024505167376090542, "alias": " - medmcqa_orig_filtered" }, "medqa_4options_g2b": { "acc,none": 0.6746031746031746, "acc_stderr,none": 0.024130158299762613, "acc_norm,none": 0.6746031746031746, "acc_norm_stderr,none": 0.024130158299762613, "alias": " - medqa_4options_g2b" }, "medqa_4options_orig_filtered": { "acc,none": 0.7142857142857143, "acc_stderr,none": 0.023266512213730585, "acc_norm,none": 0.7142857142857143, "acc_norm_stderr,none": 0.023266512213730585, "alias": " - medqa_4options_orig_filtered" } }, "groups": { "b4b": { "acc,none": 0.7882244143033293, "acc_stderr,none": 0.08841138813945006, "acc_norm,none": 0.7882244143033293, "acc_norm_stderr,none": 0.08841138813945006, "alias": "b4b" } }, "configs": { "b4bqa": { "task": "b4bqa", "dataset_path": "AIM-Harvard/b4b_drug_qa", "test_split": "test", "doc_to_text": "", "doc_to_target": "correct_choice", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false }, "medmcqa_g2b": { "task": "medmcqa_g2b", "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand", "training_split": "train", "validation_split": "validation", "test_split": "validation", "doc_to_text": "", "doc_to_target": "cop", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": true, "doc_to_decontamination_query": "{{question}}" }, "medmcqa_orig_filtered": { "task": "medmcqa_orig_filtered", "dataset_path": "AIM-Harvard/medmcqa_original", "training_split": "train", "validation_split": "validation", "test_split": "validation", "doc_to_text": "", "doc_to_target": "cop", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": true, "doc_to_decontamination_query": "{{question}}" }, "medqa_4options_g2b": { "task": "medqa_4options_g2b", "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand", "training_split": "train", "validation_split": "validation", "test_split": "test", "doc_to_text": "", "doc_to_target": "", "doc_to_choice": [ "A", "B", "C", "D" ], 
"description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false }, "medqa_4options_orig_filtered": { "task": "medqa_4options_orig_filtered", "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original", "training_split": "train", "validation_split": "validation", "test_split": "test", "doc_to_text": "", "doc_to_target": "", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false } }, "versions": { "b4b": "N/A", "b4bqa": "Yaml", "medmcqa_g2b": "Yaml", "medmcqa_orig_filtered": "Yaml", "medqa_4options_g2b": "Yaml", "medqa_4options_orig_filtered": "Yaml" }, "n-shot": { "b4b": 0, "b4bqa": 0, "medmcqa_g2b": 0, "medmcqa_orig_filtered": 0, "medqa_4options_g2b": 0, "medqa_4options_orig_filtered": 0 }, "config": { "model": "hf", "model_args": "pretrained=mistralai/Mixtral-8x22B-v0.1,parallelize=True,load_in_4bit=True", "batch_size": "auto", "batch_sizes": [ 32 ], "device": null, "use_cache": null, "limit": null, "bootstrap_iters": 100000, "gen_kwargs": null }, "git_hash": "928c7657" }