Commit fdb7c69 by pingnie (parent: b58e6fa)

change model type and sync with open llm leaderboard on model type

app.py CHANGED
@@ -56,14 +56,21 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
 
 
-def init_space():
+def init_space(update_model_type_with_open_llm=True):
     dataset_df = get_dataset_summary_table(file_path='blog/Hallucination-Leaderboard-Summary.csv')
 
     if socket.gethostname() not in {'neuromancer'}:
-        ui_snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
-        ui_snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
-
-    raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+        # sync model_type with open-llm-leaderboard
+        ui_snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
+        ui_snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
+    # if EVAL_REQUESTS_PATH_OPEN_LLM == '' then we will not update model_type with open-llm-leaderboard
+    if update_model_type_with_open_llm:
+        from src.envs import EVAL_REQUESTS_PATH_OPEN_LLM, QUEUE_REPO_OPEN_LLM
+        ui_snapshot_download(repo_id=QUEUE_REPO_OPEN_LLM, local_dir=EVAL_REQUESTS_PATH_OPEN_LLM, repo_type="dataset", tqdm_class=None, etag_timeout=30)
+    else:
+        EVAL_REQUESTS_PATH_OPEN_LLM = ""
+
+    raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, EVAL_REQUESTS_PATH_OPEN_LLM, COLS, BENCHMARK_COLS)
 
     finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
     return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
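
For reference, a minimal sketch of how the new flag changes init_space's behaviour. This call site is hypothetical and not part of the commit:

# Hypothetical usage sketch, based only on the diff above.
from app import init_space

# Default: the open-llm-leaderboard/requests snapshot is downloaded and each
# model's model_type is refreshed from the matching request file.
dataset_df, original_df, finished_df, running_df, pending_df = init_space()

# Opting out: no extra snapshot is downloaded, get_leaderboard_df receives an
# empty string, and the locally recorded model_type is kept as-is.
dataset_df, original_df, finished_df, running_df, pending_df = init_space(
    update_model_type_with_open_llm=False
)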
src/display/about.py CHANGED
@@ -5,7 +5,7 @@ TITLE = """<h1 align="center" id="space-title">Hallucinations Leaderboard</h1>""
 INTRODUCTION_TEXT = """
 📐 The Hallucinations Leaderboard aims to track, rank and evaluate hallucinations in LLMs.
 
-It evaluates the propensity for hallucination in Large Language Models (LLMs) across a diverse array of tasks, including Closed-book Open-domain QA, Summarization, Reading Comprehension, Instruction Following, Fact-Checking, Hallucination Detection, and Self-Consistency. The evaluation encompasses a wide range of datasets such as NQ Open, TriviaQA, TruthfulQA, XSum, CNN/DM, RACE, SQuADv2, MemoTrap, IFEval, FEVER, FaithDial, True-False, HaluEval, and SelfCheckGPT, offering a comprehensive assessment of each model's performance in generating accurate and contextually relevant content.
+It evaluates the propensity for hallucination in Large Language Models (LLMs) across a diverse array of tasks, including Closed-book Open-domain QA, Summarization, Reading Comprehension, Instruction Following, Fact-Checking, Hallucination Detection, and Self-Consistency. The evaluation encompasses a wide range of datasets such as NQ Open, TriviaQA, TruthfulQA, XSum, CNN/DM, RACE, SQuADv2, MemoTrap, IFEval, FEVER, FaithDial, True-False, HaluEval, and SelfCheckGPT, offering a comprehensive assessment of each model's performance in generating accurate and contextually relevant content.
 
 A more detailed explanation of the definition of hallucination and the leaderboard's motivation, tasks and dataset can be found on the "About" page and [The Hallucinations Leaderboard blog post](https://huggingface.co/blog/leaderboards-on-the-hub-hallucinations).
 
@@ -74,7 +74,7 @@ To reproduce our results, here is the commands you can run, using [this script](
 
 Alternatively, if you're interested in evaluating a specific task with a particular model, you can use the [EleutherAI LLM Evaluation Harness library](https://github.com/EleutherAI/lm-evaluation-harness/) as follows:
 `python main.py --model=hf-auto --model_args="pretrained=<your_model>,revision=<your_model_revision>,parallelize=True"`
-` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=1 --output_path=<output_path>`
+` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=1 --output_path=<output_path>`
 
 Note that the Hallucinations Library includes several tasks definitions that are not included in the Harness library -- you can find them at [this link](https://huggingface.co/spaces/hallucinations-leaderboard/leaderboard/tree/main/src/backend/tasks)).
 
@@ -108,8 +108,9 @@ For all these evaluations, a higher score is a better score.
 - {ModelType.PT.to_str(" : ")} model: new, base models, trained on a given corpora
 - {ModelType.FT.to_str(" : ")} model: pretrained models finetuned on more data
 Specific fine-tune subcategories (more adapted to chat):
-- {ModelType.IFT.to_str(" : ")} model: instruction fine-tunes, which are model fine-tuned specifically on datasets of task instruction
-- {ModelType.RL.to_str(" : ")} model: reinforcement fine-tunes, which usually change the model loss a bit with an added policy.
+- {ModelType.chat.to_str(" : ")} model: chat models (RLHF, DPO, IFT, ...).
+- {ModelType.merges.to_str(" : ")} model: base merges and moerges.
+- {ModelType.Unknown.to_str(" : ")} model: Unknown model type
 If there is no icon, we have not uploaded the information on the model yet, feel free to open an issue with the model information!
 """
 
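
As a sanity check, a hedged sketch of how the new legend entries render, assuming ModelType.to_str joins the symbol and the name with the given separator (as in the Open LLM Leaderboard template this Space mirrors); this snippet is not part of the commit:

from src.display.utils import ModelType

# Expected legend fragments, assuming to_str returns f"{symbol}{separator}{name}".
print(ModelType.chat.to_str(" : "))     # 💬 : chat models (RLHF, DPO, IFT, ...)
print(ModelType.merges.to_str(" : "))   # 🤝 : base merges and moerges
print(ModelType.Unknown.to_str(" : "))  # ? :   (Unknown has an empty name)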
src/display/utils.py CHANGED
@@ -106,9 +106,9 @@ class ModelDetails:
 
 class ModelType(Enum):
     PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
+    FT = ModelDetails(name="fine-tuned on domain-specific datasets", symbol="🔶")
+    chat = ModelDetails(name="chat models (RLHF, DPO, IFT, ...)", symbol="💬")
+    merges = ModelDetails(name="base merges and moerges", symbol="🤝")
     Unknown = ModelDetails(name="", symbol="?")
 
     def to_str(self, separator=" "):
@@ -120,10 +120,10 @@ class ModelType(Enum):
             return ModelType.FT
         if "pretrained" in type or "🟢" in type:
             return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
+        if any([k in type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]):
+            return ModelType.chat
+        if "merge" in type or "🤝" in type:
+            return ModelType.merges
         return ModelType.Unknown
 
 
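
A small sketch (not from the commit) of how the remapped from_str folds the previous open-llm categories into the new ones; the earlier fine-tuned/pretrained checks are assumed to behave as the surrounding context lines show:

from src.display.utils import ModelType

assert ModelType.from_str("instruction-tuned") is ModelType.chat   # old IFT label
assert ModelType.from_str("RL-tuned 🟦") is ModelType.chat          # old RL label
assert ModelType.from_str("chat models (RLHF, DPO, IFT, ...)") is ModelType.chat
assert ModelType.from_str("base merges and moerges 🤝") is ModelType.merges
assert ModelType.from_str("pretrained 🟢") is ModelType.PT
assert ModelType.from_str("") is ModelType.Unknown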
src/envs.py CHANGED
@@ -9,6 +9,7 @@ H4_TOKEN = os.environ.get("H4_TOKEN", None)
 REPO_ID = "hallucinations-leaderboard/leaderboard"
 
 QUEUE_REPO = "hallucinations-leaderboard/requests"
+QUEUE_REPO_OPEN_LLM = "open-llm-leaderboard/requests"
 RESULTS_REPO = "hallucinations-leaderboard/results"
 
 PRIVATE_QUEUE_REPO = "hallucinations-leaderboard/private-requests"
@@ -20,6 +21,7 @@ CACHE_PATH = os.getenv("HF_HOME", ".")
 
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+EVAL_REQUESTS_PATH_OPEN_LLM = os.path.join(CACHE_PATH, "eval-queue-open-llm")
 
 EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
 EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
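
To illustrate what the new constants point at, a hedged sketch that pulls the synced request files by hand with huggingface_hub.snapshot_download; the Space's ui_snapshot_download helper presumably wraps a similar call:

import os
from huggingface_hub import snapshot_download

CACHE_PATH = os.getenv("HF_HOME", ".")
QUEUE_REPO_OPEN_LLM = "open-llm-leaderboard/requests"
EVAL_REQUESTS_PATH_OPEN_LLM = os.path.join(CACHE_PATH, "eval-queue-open-llm")

# Mirrors the call made in init_space when update_model_type_with_open_llm=True.
snapshot_download(
    repo_id=QUEUE_REPO_OPEN_LLM,
    local_dir=EVAL_REQUESTS_PATH_OPEN_LLM,
    repo_type="dataset",
    etag_timeout=30,
)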
src/leaderboard/read_evals.py CHANGED
@@ -1,6 +1,7 @@
 import glob
 import json
 import os
+from tqdm import tqdm
 from dataclasses import dataclass
 
 import dateutil
@@ -125,6 +126,18 @@ class EvalResult:
         except Exception as e:
             print(f"Could not find request file for {self.org}/{self.model} -- path: {requests_path} -- {e}")
 
+    def update_model_type_with_open_llm_request_file(self, open_llm_requests_path):
+        """Finds the relevant request file for the current model and updates info with it"""
+        request_file = get_request_file_for_model_open_llm(open_llm_requests_path, self.full_model, self.precision.value.name)
+
+        if request_file:
+            try:
+                with open(request_file, "r") as f:
+                    request = json.load(f)
+                self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
+            except Exception as e:
+                pass
+
     def is_complete(self) -> bool:
         for task in Tasks:
             if task.value.benchmark not in self.results:
@@ -180,8 +193,29 @@ def get_request_file_for_model(requests_path, model_name, precision):
             request_file = tmp_request_file
     return request_file
 
+def get_request_file_for_model_open_llm(requests_path, model_name, precision):
+    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
+    request_files = os.path.join(
+        requests_path,
+        f"{model_name}_eval_request_*.json",
+    )
+    request_files = glob.glob(request_files)
 
-def get_raw_eval_results(results_path: str, requests_path: str, is_backend: bool = False) -> list[EvalResult]:
+    # Select correct request file (precision)
+    request_file = ""
+    request_files = sorted(request_files, reverse=True)
+    for tmp_request_file in request_files:
+        with open(tmp_request_file, "r") as f:
+            req_content = json.load(f)
+            if (
+                req_content["status"] in ["FINISHED"]
+                and req_content["precision"] == precision.split(".")[-1]
+            ):
+                request_file = tmp_request_file
+    return request_file
+
+
+def get_raw_eval_results(results_path: str, requests_path: str, requests_path_open_llm: str, is_backend: bool = False) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
 
@@ -200,11 +234,12 @@ def get_raw_eval_results(results_path: str, requests_path: str, is_backend: bool
                 model_result_filepaths.append(os.path.join(root, file))
 
     eval_results = {}
-    for model_result_filepath in model_result_filepaths:
+    for model_result_filepath in tqdm(model_result_filepaths, desc="reading model_result_filepaths"):
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath, is_backend=is_backend)
         eval_result.update_with_request_file(requests_path)
-
+        if requests_path_open_llm:
+            eval_result.update_model_type_with_open_llm_request_file(requests_path_open_llm)
         # Store results of same eval together
        eval_name = eval_result.eval_name
         if eval_name in eval_results.keys():
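
A hedged illustration (not from the commit) of the filename pattern the new lookup matches on disk; the model id and precision below are hypothetical placeholders:

import glob
import json
import os

requests_path_open_llm = "eval-queue-open-llm"
full_model = "some-org/some-model"   # hypothetical, normally self.full_model
precision = "float16"                # normally self.precision.value.name

pattern = os.path.join(requests_path_open_llm, f"{full_model}_eval_request_*.json")
for path in sorted(glob.glob(pattern), reverse=True):
    with open(path, "r") as f:
        req = json.load(f)
    # Only FINISHED runs at the matching precision are considered, as in
    # get_request_file_for_model_open_llm above.
    if req["status"] in ["FINISHED"] and req["precision"] == precision.split(".")[-1]:
        print(path, "->", req.get("model_type", "Unknown"))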
src/populate.py CHANGED
@@ -15,11 +15,12 @@ from src.display.utils import Tasks
 
 def get_leaderboard_df(results_path: str,
                        requests_path: str,
+                       requests_path_open_llm: str,
                        cols: list,
                        benchmark_cols: list,
                        is_backend: bool = False) -> tuple[list[EvalResult], pd.DataFrame]:
     # Returns a list of EvalResult
-    raw_data: list[EvalResult] = get_raw_eval_results(results_path, requests_path)
+    raw_data: list[EvalResult] = get_raw_eval_results(results_path, requests_path, requests_path_open_llm)
 
     all_data_json_ = [v.to_dict() for v in raw_data if v.is_complete()]
 
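
Finally, a hedged sketch of the updated call signature as seen from a caller other than init_space; the COLS/BENCHMARK_COLS import location is an assumption:

from src.envs import EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH
from src.populate import get_leaderboard_df
from src.display.utils import COLS, BENCHMARK_COLS  # assumed location of these column lists

# Passing "" for requests_path_open_llm skips the open-llm model_type sync,
# matching the comment in init_space.
raw_data, original_df = get_leaderboard_df(
    EVAL_RESULTS_PATH,
    EVAL_REQUESTS_PATH,
    "",
    COLS,
    BENCHMARK_COLS,
    is_backend=True,
)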