Update src/backend/run_eval_suite.py
Browse files
src/backend/run_eval_suite.py
CHANGED
@@ -15,16 +15,14 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz
|
|
15 |
print(
|
16 |
"WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
|
17 |
)
|
18 |
-
|
19 |
-
task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)
|
20 |
|
21 |
print(f"Selected Tasks: {task_names}")
|
22 |
-
|
23 |
results = evaluator.simple_evaluate(
|
24 |
model="hf-causal-experimental", # "hf-causal"
|
25 |
model_args=eval_request.get_model_args(),
|
26 |
tasks=task_names,
|
27 |
-
num_fewshot=num_fewshot,
|
28 |
batch_size=batch_size,
|
29 |
device=device,
|
30 |
no_cache=no_cache,
|
@@ -54,4 +52,4 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz
|
|
54 |
repo_type="dataset",
|
55 |
)
|
56 |
|
57 |
-
return results
|
|
|
15 |
print(
|
16 |
"WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
|
17 |
)
|
18 |
+
task_names = ["medmcqa", "medqa_4options", "mmlu_anatomy", "mmlu_clinical_knowledge", "mmlu_college_biology", "mmlu_college_medicine", "mmlu_medical_genetics", "mmlu_professional_medicine", "pubmedqa"]
|
|
|
19 |
|
20 |
print(f"Selected Tasks: {task_names}")
|
|
|
21 |
results = evaluator.simple_evaluate(
|
22 |
model="hf-causal-experimental", # "hf-causal"
|
23 |
model_args=eval_request.get_model_args(),
|
24 |
tasks=task_names,
|
25 |
+
# num_fewshot=num_fewshot,
|
26 |
batch_size=batch_size,
|
27 |
device=device,
|
28 |
no_cache=no_cache,
|
|
|
52 |
repo_type="dataset",
|
53 |
)
|
54 |
|
55 |
+
return results
|