albertvillanova (HF staff) committed
Commit d295ed3
1 Parent(s): 79f86f5

Fix E402 module-import-not-at-top-of-file

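E402 is the pycodestyle rule "module level import not at top of file" (reported by flake8 and Ruff): it fires when an import statement appears below other executable module-level code. The sketch below is a hypothetical, minimal illustration of the pattern this commit applies across app.py, the two main_backend_*.py entry points and src/logging.py; it is not code from this repository, and the `httpx` logger name is only an example. Assuming Ruff is the linter used for this check (the lint configuration itself is not part of this diff), the rule can be run in isolation with `ruff check --select E402 .`.

    # Hypothetical sketch of an E402 fix, not code from this repository.
    #
    # Before -- an import placed below executable module-level code triggers E402:
    #
    #     import logging
    #     logging.getLogger("httpx").setLevel(logging.WARNING)  # executable statement
    #     from functools import partial                         # E402 reported here
    #
    # After -- every import is hoisted above the first executable statement,
    # and the logger configuration moves below the import block:
    import logging
    from functools import partial  # noqa: F401  (kept only to show the ordering)

    logging.getLogger("httpx").setLevel(logging.WARNING)
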
app.py CHANGED
@@ -1,24 +1,22 @@
 import logging
-from apscheduler.schedulers.background import BackgroundScheduler
-
-from src.logging import configure_root_logger
-
-logging.getLogger("httpx").setLevel(logging.WARNING)
-logging.getLogger("numexpr").setLevel(logging.WARNING)
-logging.getLogger("absl").setLevel(logging.WARNING)
-configure_root_logger()
-
 from functools import partial
 
 import gradio as gr
+from apscheduler.schedulers.background import BackgroundScheduler
+
 # Choose ligtheval or harness backend
 from main_backend_lighteval import run_auto_eval
-#from main_backend_harness import run_auto_eval
+# from main_backend_harness import run_auto_eval
 
 from src.display.log_visualizer import log_file_to_html_string
 from src.display.css_html_js import dark_mode_gradio_js
 from src.envs import REFRESH_RATE, REPO_ID, QUEUE_REPO, RESULTS_REPO
-from src.logging import setup_logger, log_file
+from src.logging import configure_root_logger, setup_logger, log_file
+
+logging.getLogger("httpx").setLevel(logging.WARNING)
+logging.getLogger("numexpr").setLevel(logging.WARNING)
+logging.getLogger("absl").setLevel(logging.WARNING)
+configure_root_logger()
 
 logging.basicConfig(level=logging.INFO)
 logger = setup_logger(__name__)
@@ -39,6 +37,7 @@ links_md = f"""
 | Results Repo | [{RESULTS_REPO}](https://huggingface.co/datasets/{RESULTS_REPO}) |
 """
 
+
 def auto_eval():
     logger.info("Triggering Auto Eval")
     run_auto_eval()
@@ -52,20 +51,18 @@ with gr.Blocks(js=dark_mode_gradio_js) as demo:
     output_html = gr.HTML(partial(log_file_to_html_string, reverse=reverse_order_checkbox), every=1)
     with gr.Row():
         download_button = gr.DownloadButton("Download Log File", value=log_file)
-        with gr.Accordion('Log View Configuration', open=False):
+        with gr.Accordion("Log View Configuration", open=False):
             reverse_order_checkbox.render()
     # Add a button that when pressed, triggers run_auto_eval
     button = gr.Button("Manually Run Evaluation")
     gr.Markdown(links_md)
 
-    #dummy = gr.Markdown(auto_eval, every=REFRESH_RATE, visible=False)
+    # dummy = gr.Markdown(auto_eval, every=REFRESH_RATE, visible=False)
 
     button.click(fn=auto_eval, inputs=[], outputs=[])
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     scheduler = BackgroundScheduler()
     scheduler.add_job(auto_eval, "interval", seconds=REFRESH_RATE)
     scheduler.start()
-    demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0",
-                                                    show_error=True,
-                                                    server_port=7860)
+    demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0", show_error=True, server_port=7860)

custom_tasks.py CHANGED
@@ -6,6 +6,7 @@ This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then im
 
 Author:
 """
+
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
 from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES

main_backend_harness.py CHANGED
@@ -3,25 +3,55 @@ import pprint
 
 from huggingface_hub import snapshot_download
 
-logging.getLogger("openai").setLevel(logging.WARNING)
-
 from src.backend.run_eval_suite_harness import run_evaluation
-from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request, PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS
+from src.backend.manage_requests import (
+    check_completed_evals,
+    get_eval_requests,
+    set_eval_request,
+    PENDING_STATUS,
+    RUNNING_STATUS,
+    FINISHED_STATUS,
+    FAILED_STATUS,
+)
 from src.backend.sort_queue import sort_models_by_priority
-
-from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, DEVICE, API, LIMIT, TOKEN
+from src.envs import (
+    QUEUE_REPO,
+    EVAL_REQUESTS_PATH_BACKEND,
+    RESULTS_REPO,
+    EVAL_RESULTS_PATH_BACKEND,
+    DEVICE,
+    API,
+    LIMIT,
+    TOKEN,
+)
 from src.envs import TASKS_HARNESS, NUM_FEWSHOT
 from src.logging import setup_logger
 
 
+logging.getLogger("openai").setLevel(logging.WARNING)
 
 # logging.basicConfig(level=logging.ERROR)
 logger = setup_logger(__name__)
 pp = pprint.PrettyPrinter(width=80)
 
 
-snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
+snapshot_download(
+    repo_id=RESULTS_REPO,
+    revision="main",
+    local_dir=EVAL_RESULTS_PATH_BACKEND,
+    repo_type="dataset",
+    max_workers=60,
+    token=TOKEN,
+)
+snapshot_download(
+    repo_id=QUEUE_REPO,
+    revision="main",
+    local_dir=EVAL_REQUESTS_PATH_BACKEND,
+    repo_type="dataset",
+    max_workers=60,
+    token=TOKEN,
+)
+
 
 def run_auto_eval():
     current_pending_status = [PENDING_STATUS]
@@ -36,11 +66,13 @@ def run_auto_eval():
         hf_repo=QUEUE_REPO,
         local_dir=EVAL_REQUESTS_PATH_BACKEND,
         hf_repo_results=RESULTS_REPO,
-        local_dir_results=EVAL_RESULTS_PATH_BACKEND
+        local_dir_results=EVAL_RESULTS_PATH_BACKEND,
     )
 
     # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
+    eval_requests = get_eval_requests(
+        job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
+    )
     # Sort the evals by priority (first submitted first run)
     eval_requests = sort_models_by_priority(api=API, models=eval_requests)
 
@@ -61,16 +93,16 @@ def run_auto_eval():
     )
 
     run_evaluation(
-        eval_request=eval_request,
-        task_names=TASKS_HARNESS,
-        num_fewshot=NUM_FEWSHOT,
+        eval_request=eval_request,
+        task_names=TASKS_HARNESS,
+        num_fewshot=NUM_FEWSHOT,
         local_dir=EVAL_RESULTS_PATH_BACKEND,
         results_repo=RESULTS_REPO,
         batch_size="auto",
-        device=DEVICE,
-        limit=LIMIT
-    )
+        device=DEVICE,
+        limit=LIMIT,
+    )
 
 
 if __name__ == "__main__":
-    run_auto_eval()
+    run_auto_eval()

main_backend_lighteval.py CHANGED
@@ -3,22 +3,58 @@ import pprint
 
 from huggingface_hub import snapshot_download
 
-logging.getLogger("openai").setLevel(logging.WARNING)
 
 from src.backend.run_eval_suite_lighteval import run_evaluation
-from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request, PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS
+from src.backend.manage_requests import (
+    check_completed_evals,
+    get_eval_requests,
+    set_eval_request,
+    PENDING_STATUS,
+    RUNNING_STATUS,
+    FINISHED_STATUS,
+    FAILED_STATUS,
+)
 from src.backend.sort_queue import sort_models_by_priority
-
-from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, API, LIMIT, TOKEN, ACCELERATOR, VENDOR, REGION, TASKS_LIGHTEVAL
+from src.envs import (
+    QUEUE_REPO,
+    EVAL_REQUESTS_PATH_BACKEND,
+    RESULTS_REPO,
+    EVAL_RESULTS_PATH_BACKEND,
+    API,
+    LIMIT,
+    TOKEN,
+    ACCELERATOR,
+    VENDOR,
+    REGION,
+    TASKS_LIGHTEVAL,
+)
 from src.logging import setup_logger
 
+
+logging.getLogger("openai").setLevel(logging.WARNING)
+
 logger = setup_logger(__name__)
 
 # logging.basicConfig(level=logging.ERROR)
 pp = pprint.PrettyPrinter(width=80)
 
-snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
+snapshot_download(
+    repo_id=RESULTS_REPO,
+    revision="main",
+    local_dir=EVAL_RESULTS_PATH_BACKEND,
+    repo_type="dataset",
+    max_workers=60,
+    token=TOKEN,
+)
+snapshot_download(
+    repo_id=QUEUE_REPO,
+    revision="main",
+    local_dir=EVAL_REQUESTS_PATH_BACKEND,
+    repo_type="dataset",
+    max_workers=60,
+    token=TOKEN,
+)
+
 
 def run_auto_eval():
     current_pending_status = [PENDING_STATUS]
@@ -33,11 +69,13 @@ def run_auto_eval():
         hf_repo=QUEUE_REPO,
         local_dir=EVAL_REQUESTS_PATH_BACKEND,
         hf_repo_results=RESULTS_REPO,
-        local_dir_results=EVAL_RESULTS_PATH_BACKEND
+        local_dir_results=EVAL_RESULTS_PATH_BACKEND,
    )
 
     # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
+    eval_requests = get_eval_requests(
+        job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
+    )
     # Sort the evals by priority (first submitted first run)
     eval_requests = sort_models_by_priority(api=API, models=eval_requests)
 
@@ -49,7 +87,6 @@ def run_auto_eval():
     eval_request = eval_requests[0]
     logger.info(pp.pformat(eval_request))
 
-
     set_eval_request(
         api=API,
         eval_request=eval_request,
@@ -59,29 +96,33 @@ def run_auto_eval():
     )
 
     # This needs to be done
-    #instance_size, instance_type = get_instance_for_model(eval_request)
+    # instance_size, instance_type = get_instance_for_model(eval_request)
     # For GPU
-    # instance_size, instance_type = "small", "g4dn.xlarge"
+    # instance_size, instance_type = "small", "g4dn.xlarge"
     # For CPU
     # Updated naming available at https://huggingface.co/docs/inference-endpoints/pricing
     instance_size, instance_type = "x4", "intel-icl"
-    logger.info(f'Starting Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}')
+    logger.info(
+        f"Starting Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}"
+    )
 
     run_evaluation(
-        eval_request=eval_request,
-        task_names=TASKS_LIGHTEVAL,
+        eval_request=eval_request,
+        task_names=TASKS_LIGHTEVAL,
         local_dir=EVAL_RESULTS_PATH_BACKEND,
-        batch_size=1,
-        accelerator=ACCELERATOR,
-        region=REGION,
-        vendor=VENDOR,
-        instance_size=instance_size,
-        instance_type=instance_type,
-        limit=LIMIT
-    )
+        batch_size=1,
+        accelerator=ACCELERATOR,
+        region=REGION,
+        vendor=VENDOR,
+        instance_size=instance_size,
+        instance_type=instance_type,
+        limit=LIMIT,
+    )
 
-    logger.info(f'Completed Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}')
+    logger.info(
+        f"Completed Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}"
+    )
 
 
 if __name__ == "__main__":
-    run_auto_eval()
+    run_auto_eval()

scripts/create_request_file.py CHANGED
@@ -34,7 +34,9 @@ def get_model_size(model_info, precision: str):
 def main():
     api = HfApi()
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN)
+    snapshot_download(
+        repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN
+    )
 
     model_name = click.prompt("Enter model name")
     revision = click.prompt("Enter revision", default="main")

scripts/fix_harness_import.py CHANGED
@@ -2,10 +2,11 @@
 It creates a folder not ported during harness package creation (as they don't use a Manifest file atm and it ignore `.json` files).
 It will need to be updated if we want to use the harness' version of big bench to actually copy the json files.
 """
+
 import os
 
 import lm_eval
 
 if __name__ == "__main__":
     lm_eval_path = lm_eval.__path__[0]
-    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
+    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)

src/backend/manage_requests.py CHANGED
@@ -14,27 +14,30 @@ RUNNING_STATUS = "RUNNING"
 FINISHED_STATUS = "FINISHED"
 FAILED_STATUS = "FAILED"
 
+
 @dataclass
 class EvalRequest:
-    """This class represents one evaluation request file.
-    """
+    """This class represents one evaluation request file."""
+
     model: str
     status: str
     json_filepath: str
     weight_type: str = "Original"
     model_type: str = ""  # pretrained, finetuned, with RL
     precision: str = ""  # float16, bfloat16
-    revision: str = "main"  # commit hash
-    submitted_time: Optional[str] = "2022-05-18T11:40:22.519222"  # random date just so that we can still order requests by date
-    model_type: Optional[str] = None  # pretrained, fine-tuned, etc - define your own categories in
+    revision: str = "main"  # commit hash
+    submitted_time: Optional[str] = (
+        "2022-05-18T11:40:22.519222"  # random date just so that we can still order requests by date
+    )
+    model_type: Optional[str] = None  # pretrained, fine-tuned, etc - define your own categories in
     likes: Optional[int] = 0
     params: Optional[int] = None
     license: Optional[str] = ""
     base_model: Optional[str] = ""
     private: Optional[bool] = False
-
+
     def get_model_args(self):
-        """Edit this function if you want to manage more complex quantization issues. You'll need to map it to
+        """Edit this function if you want to manage more complex quantization issues. You'll need to map it to
         the evaluation suite you chose.
         """
         model_args = f"pretrained={self.model},revision={self.revision}"
@@ -45,7 +48,7 @@ class EvalRequest:
         # Quantized models need some added config, the install of bits and bytes, etc
         else:
             raise Exception(f"Unknown precision {self.precision}.")
-
+
         return model_args
 
 
@@ -77,7 +80,9 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[Ev
     Returns:
         `list[EvalRequest]`: a list of model info dicts.
     """
-    snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60, token=TOKEN)
+    snapshot_download(
+        repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60, token=TOKEN
+    )
     json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
 
     eval_requests = []
@@ -102,6 +107,7 @@ def eval_was_running(eval_request: EvalRequest):
     status = data["status"]
     return status == RUNNING_STATUS
 
+
 def check_completed_evals(
     api: HfApi,
     hf_repo: str,
@@ -114,12 +120,12 @@ def check_completed_evals(
 ):
     """Checks if the currently running evals are completed, if yes, update their status on the hub."""
     snapshot_download(
-        repo_id=hf_repo_results,
-        revision="main",
-        local_dir=local_dir_results,
-        repo_type="dataset",
-        max_workers=60,
-        token=TOKEN
+        repo_id=hf_repo_results,
+        revision="main",
+        local_dir=local_dir_results,
+        repo_type="dataset",
+        max_workers=60,
+        token=TOKEN,
     )
 
     running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
@@ -134,13 +140,9 @@
         output_file_exists = len(glob.glob(output_file)) > 0
 
         if output_file_exists:
-            logger.info(
-                f"EXISTS output file exists for {model} setting it to {completed_status}"
-            )
+            logger.info(f"EXISTS output file exists for {model} setting it to {completed_status}")
             set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
         else:
             if eval_was_running(eval_request=eval_request):
-                logger.info(
-                    f"No result file found for {model} setting it to {failed_status}"
-                )
+                logger.info(f"No result file found for {model} setting it to {failed_status}")
                 set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)

src/backend/run_eval_suite_harness.py CHANGED
@@ -15,7 +15,18 @@ from typing import Union
 logging.getLogger("openai").setLevel(logging.WARNING)
 logger = setup_logger(__name__)
 
-def run_evaluation(eval_request: EvalRequest, task_names: list, num_fewshot: int, batch_size: Union[int, str], device: str, local_dir: str, results_repo: str, no_cache: bool =True, limit: int =None):
+
+def run_evaluation(
+    eval_request: EvalRequest,
+    task_names: list,
+    num_fewshot: int,
+    batch_size: Union[int, str],
+    device: str,
+    local_dir: str,
+    results_repo: str,
+    no_cache: bool = True,
+    limit: int = None,
+):
     """Runs one evaluation for the current evaluation request file, then pushes the results to the hub.
 
     Args:
@@ -51,7 +62,7 @@ def run_evaluation(eval_request: EvalRequest, task_names: list, num_fewshot: int
         batch_size=batch_size,
         device=device,
         limit=limit,
-        write_out=True  # Whether to write out an example document and model input, for checking task integrity
+        write_out=True,  # Whether to write out an example document and model input, for checking task integrity
     )
 
     results["config"]["model_dtype"] = eval_request.precision

src/backend/run_eval_suite_lighteval.py CHANGED
@@ -13,7 +13,19 @@ from src.logging import setup_logger
 logging.getLogger("openai").setLevel(logging.WARNING)
 logger = setup_logger(__name__)
 
-def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int, local_dir: str, accelerator: str, region: str, vendor: str, instance_size: str, instance_type: str, limit=None):
+
+def run_evaluation(
+    eval_request: EvalRequest,
+    task_names: str,
+    batch_size: int,
+    local_dir: str,
+    accelerator: str,
+    region: str,
+    vendor: str,
+    instance_size: str,
+    instance_type: str,
+    limit=None,
+):
     """Runs one evaluation for the current evaluation request file using lighteval, then pushes the results to the hub.
 
     Args:
@@ -28,18 +40,20 @@ def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int,
         local_dir (str): Where to save the results locally
         no_cache (bool, optional): Whether to use a cache or not.
         limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
-    """
+    """
 
     if limit:
-        logger.info("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
+        logger.info(
+            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
+        )
 
     evaluation_tracker = EvaluationTracker(
         output_dir="./results",
-        save_details = True,
-        push_to_hub = True,
-        push_to_tensorboard = False,
-        hub_results_org= RESULTS_REPO,
-        public = False,
+        save_details=True,
+        push_to_hub=True,
+        push_to_tensorboard=False,
+        hub_results_org=RESULTS_REPO,
+        public=False,
     )
 
     pipeline_params = PipelineParameters(
@@ -48,21 +62,21 @@ def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int,
         max_samples=limit,
         use_chat_template=False,
         system_prompt=None,
-        custom_tasks_directory="custom_tasks.py", # if using a custom task
+        custom_tasks_directory="custom_tasks.py",  # if using a custom task
     )
 
     model_config = InferenceEndpointModelConfig(
         # Endpoint parameters
-        name = eval_request.model.replace(".", "-").lower(),
-        repository = eval_request.model,
-        accelerator = accelerator,
-        vendor= vendor,
-        region= region,
-        instance_size= instance_size,
-        instance_type= instance_type,
-        should_reuse_existing= False,
-        model_dtype= eval_request.precision,
-        revision= eval_request.revision,
+        name=eval_request.model.replace(".", "-").lower(),
+        repository=eval_request.model,
+        accelerator=accelerator,
+        vendor=vendor,
+        region=region,
+        instance_size=instance_size,
+        instance_type=instance_type,
+        should_reuse_existing=False,
+        model_dtype=eval_request.precision,
+        revision=eval_request.revision,
     )
 
     pipeline = Pipeline(
@@ -81,7 +95,7 @@ def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int,
         dumped = json.dumps(results, indent=2)
         logger.info(dumped)
 
-    except Exception: # if eval failed, we force a cleanup
+    except Exception:  # if eval failed, we force a cleanup
         pipeline.model.cleanup()
 
     return results

src/backend/sort_queue.py CHANGED
@@ -10,6 +10,7 @@ class ModelMetadata:
     likes: int = 0
     size: int = 15
 
+
 # All the functions below sort the models in the queue based on different parameters
 def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
     private_models = [model for model in models if model.private]
@@ -17,11 +18,14 @@ def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalR
 
     return sort_by_submit_date(private_models) + sort_by_submit_date(public_models)
 
+
 def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
     return sorted(eval_requests, key=lambda x: x.submitted_time, reverse=False)
 
+
 def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
     return sorted(eval_requests, key=lambda x: x.params, reverse=False)
 
+
 def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
-    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
+    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)

src/display/log_visualizer.py CHANGED
@@ -11,8 +11,8 @@ from src.logging import log_file
 
 def log_file_to_html_string(reverse=True):
     with open(log_file, "rt") as f:
-        lines = f.readlines()
-        lines = lines[-NUM_LINES_VISUALIZE:]
+        lines = f.readlines()
+        lines = lines[-NUM_LINES_VISUALIZE:]
 
     if reverse:
         lines = reversed(lines)
@@ -25,12 +25,12 @@
     html_content = console.export_html(inline_styles=True)
 
     # Parse the HTML content using BeautifulSoup
-    soup = BeautifulSoup(html_content, 'lxml')
+    soup = BeautifulSoup(html_content, "lxml")
 
     # Modify the <pre> tag and add custom styles
     pre_tag = soup.pre
-    pre_tag['class'] = 'scrollable'
-    del pre_tag['style']
+    pre_tag["class"] = "scrollable"
+    del pre_tag["style"]
 
     # Add your custom styles and the .scrollable CSS to the <style> tag
     style_tag = soup.style

src/envs.py CHANGED
@@ -4,21 +4,21 @@ from huggingface_hub import HfApi
 
 # Info to change for your repository
 # ----------------------------------
-TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
+TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org
 
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset
+OWNER = "demo-leaderboard-backend"  # Change to your org - don't forget to create a results and request dataset
 
 # For harness evaluations
-DEVICE = "cpu" # "cuda:0" if you add compute, for harness evaluations
-LIMIT = 20 # !!!! For testing, should be None for actual evaluations!!!
-NUM_FEWSHOT = 0 # Change with your few shot for the Harness evaluations
+DEVICE = "cpu"  # "cuda:0" if you add compute, for harness evaluations
+LIMIT = 20  # !!!! For testing, should be None for actual evaluations!!!
+NUM_FEWSHOT = 0  # Change with your few shot for the Harness evaluations
 TASKS_HARNESS = ["anli_r1", "logiqa"]
 
 # For lighteval evaluations
 ACCELERATOR = "cpu"
 REGION = "us-east-1"
 VENDOR = "aws"
-TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
+TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
 # To add your own tasks, edit the custom file and launch it with `custom|myothertask|0|0``
 
 # ---------------------------------------------------
@@ -27,7 +27,7 @@ QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
 # If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
@@ -39,4 +39,3 @@ REFRESH_RATE = 10 * 60 # 10 min
 NUM_LINES_VISUALIZE = 300
 
 API = HfApi(token=TOKEN)
-

src/logging.py CHANGED
@@ -1,18 +1,16 @@
+import logging
 from pathlib import Path
 
 proj_dir = Path(__file__).parents[1]
 
-log_file = proj_dir/"output.log"
-
-
-import logging
+log_file = proj_dir / "output.log"
 
 
 def setup_logger(name: str):
     logger = logging.getLogger(name)
     logger.setLevel(logging.INFO)
 
-    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 
     # Create a file handler to write logs to a file
     file_handler = logging.FileHandler(log_file)
@@ -28,10 +26,10 @@ def configure_root_logger():
     logging.basicConfig(level=logging.INFO)
     root_logger = logging.getLogger()
 
-    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 
     file_handler = logging.FileHandler(log_file)
     file_handler.setLevel(logging.INFO)
     file_handler.setFormatter(formatter)
 
-    root_logger.addHandler(file_handler)
+    root_logger.addHandler(file_handler)