# Spaces: Running on CPU Upgrade
import os | |
import torch | |
from dataclasses import dataclass | |
from enum import Enum | |
from src.envs import CACHE_PATH | |
@dataclass
class Task:
    """Describe a single evaluation task tracked by the leaderboard.

    The ``@dataclass`` decorator generates the positional constructor that
    the ``Tasks`` enum relies on, e.g. ``Task("nq_open", "em", "NQ Open", 64)``.

    Attributes:
        benchmark: Task key in the results JSON file.
        metric: Metric key in the results JSON file.
        col_name: Name to display in the leaderboard.
        num_fewshot: Number of few-shot examples used for this task.
    """

    benchmark: str
    metric: str
    col_name: str
    num_fewshot: int
class Tasks(Enum):
    """Enumerate the evaluation tasks; each member's value is a ``Task``.

    Every entry gives the task key and metric key used in the results JSON
    file, the name displayed in the leaderboard, and the few-shot count.
    """

    # 64 shots, as in the ATLAS paper.
    task0 = Task(benchmark="nq_open", metric="em", col_name="NQ Open", num_fewshot=64)
    task1 = Task(benchmark="triviaqa", metric="em", col_name="TriviaQA", num_fewshot=64)

    # TruthfulQA is intended as a zero-shot benchmark [5, 47].
    # https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
    # The generative variant is currently disabled (hence no task2):
    # task2 = Task("truthfulqa_gen", "rougeL_acc", "TruthfulQA Gen", 0)
    task3 = Task(benchmark="truthfulqa_mc1", metric="acc", col_name="TruthfulQA MC1", num_fewshot=0)
    task4 = Task(benchmark="truthfulqa_mc2", metric="acc", col_name="TruthfulQA MC2", num_fewshot=0)

    task5 = Task(benchmark="halueval_qa", metric="acc", col_name="HaluEval QA", num_fewshot=0)
    task6 = Task(benchmark="halueval_dialogue", metric="acc", col_name="HaluEval Dialogue", num_fewshot=0)
    task7 = Task(
        benchmark="halueval_summarization",
        metric="acc",
        col_name="HaluEval Summarization",
        num_fewshot=0,
    )

    task8 = Task(benchmark="xsum", metric="rougeL", col_name="XSum", num_fewshot=2)
    task9 = Task(benchmark="cnndm", metric="rougeL", col_name="CNN/DM", num_fewshot=2)

    task10 = Task(benchmark="memo-trap", metric="acc", col_name="memo-trap", num_fewshot=0)

    task11 = Task(benchmark="nq8", metric="em", col_name="NQ Open 8", num_fewshot=8)
    task12 = Task(benchmark="tqa8", metric="em", col_name="TriviaQA 8", num_fewshot=8)

    task13 = Task(
        benchmark="ifeval",
        metric="prompt_level_strict_acc",
        col_name="IFEval",
        num_fewshot=0,
    )

    task14 = Task(
        benchmark="selfcheckgpt",
        metric="max-selfcheckgpt",
        col_name="SelfCheckGPT",
        num_fewshot=0,
    )

    task15 = Task(benchmark="fever10", metric="acc", col_name="FEVER", num_fewshot=16)
    task16 = Task(benchmark="squadv2", metric="exact", col_name="SQuADv2", num_fewshot=4)
    task17 = Task(benchmark="truefalse_cieacf", metric="acc", col_name="TrueFalse", num_fewshot=8)
# NUM_FEWSHOT = 64  # Change with your few shot

# Backend-side queue and results directories, both located under CACHE_PATH.
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

# Select "cuda" when torch reports CUDA as available, otherwise fall back to "cpu".
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Testing; needs to be None
LIMIT = None