Fix style #12, opened by albertvillanova (HF staff)

Files changed:
- Makefile +2 -4
- app.py +19 -21
- custom_tasks.py +1 -0
- main_backend_harness.py +50 -17
- main_backend_lighteval.py +65 -25
- pyproject.toml +6 -9
- requirements.txt +0 -1
- scripts/create_request_file.py +6 -2
- scripts/fix_harness_import.py +3 -1
- src/backend/manage_requests.py +25 -21
- src/backend/run_eval_suite_harness.py +17 -6
- src/backend/run_eval_suite_lighteval.py +36 -26
- src/backend/sort_queue.py +5 -2
- src/display/log_visualizer.py +5 -6
- src/envs.py +8 -8
- src/logging.py +6 -8
Makefile CHANGED
@@ -2,12 +2,10 @@
 
 
 style:
-	python -m black --line-length 119 .
-	python -m isort .
 	ruff check --fix .
+	ruff format .
 
 
 quality:
-	python -m black --check --line-length 119 .
-	python -m isort --check-only .
 	ruff check .
+	ruff format --check .
app.py CHANGED
@@ -1,32 +1,31 @@
 import logging
-from apscheduler.schedulers.background import BackgroundScheduler
-
-from src.logging import configure_root_logger
-
-logging.getLogger("httpx").setLevel(logging.WARNING)
-logging.getLogger("numexpr").setLevel(logging.WARNING)
-logging.getLogger("absl").setLevel(logging.WARNING)
-configure_root_logger()
-
 from functools import partial
 
 import gradio as gr
+from apscheduler.schedulers.background import BackgroundScheduler
+
 # Choose ligtheval or harness backend
+# from main_backend_harness import run_auto_eval
 from main_backend_lighteval import run_auto_eval
-#from main_backend_harness import run_auto_eval
 
-from src.display.log_visualizer import log_file_to_html_string
 from src.display.css_html_js import dark_mode_gradio_js
-from src.
-from src.
+from src.display.log_visualizer import log_file_to_html_string
+from src.envs import QUEUE_REPO, REFRESH_RATE, REPO_ID, RESULTS_REPO
+from src.logging import configure_root_logger, log_file, setup_logger
+
+
+logging.getLogger("httpx").setLevel(logging.WARNING)
+logging.getLogger("numexpr").setLevel(logging.WARNING)
+logging.getLogger("absl").setLevel(logging.WARNING)
+configure_root_logger()
 
 logging.basicConfig(level=logging.INFO)
 logger = setup_logger(__name__)
 
 
-intro_md =
+intro_md = """
 # Intro
-This is a visual for the auto evaluator.
+This is a visual for the auto evaluator.
 """
 
 links_md = f"""
@@ -39,6 +38,7 @@ links_md = f"""
 | Results Repo | [{RESULTS_REPO}](https://huggingface.co/datasets/{RESULTS_REPO}) |
 """
 
+
 def auto_eval():
     logger.info("Triggering Auto Eval")
     run_auto_eval()
@@ -52,20 +52,18 @@ with gr.Blocks(js=dark_mode_gradio_js) as demo:
         output_html = gr.HTML(partial(log_file_to_html_string, reverse=reverse_order_checkbox), every=1)
         with gr.Row():
             download_button = gr.DownloadButton("Download Log File", value=log_file)
-            with gr.Accordion(
+            with gr.Accordion("Log View Configuration", open=False):
                 reverse_order_checkbox.render()
     # Add a button that when pressed, triggers run_auto_eval
     button = gr.Button("Manually Run Evaluation")
     gr.Markdown(links_md)
 
-    #dummy = gr.Markdown(auto_eval, every=REFRESH_RATE, visible=False)
+    # dummy = gr.Markdown(auto_eval, every=REFRESH_RATE, visible=False)
 
     button.click(fn=auto_eval, inputs=[], outputs=[])
 
-if __name__ ==
+if __name__ == "__main__":
     scheduler = BackgroundScheduler()
     scheduler.add_job(auto_eval, "interval", seconds=REFRESH_RATE)
     scheduler.start()
-    demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0",
-                                                    show_error=True,
-                                                    server_port=7860)
+    demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0", show_error=True, server_port=7860)
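The refreshed app.py keeps the same runtime behaviour: a Gradio UI plus an APScheduler BackgroundScheduler that calls auto_eval every REFRESH_RATE seconds. Below is a minimal sketch of that scheduling pattern, using illustrative values rather than the Space's real configuration:

import time

from apscheduler.schedulers.background import BackgroundScheduler

REFRESH_RATE = 10 * 60  # seconds, mirrors the constant in src/envs.py


def auto_eval():
    # In the Space this calls run_auto_eval(); here it just prints a message.
    print("Triggering Auto Eval")


if __name__ == "__main__":
    scheduler = BackgroundScheduler()
    # Same pattern as app.py: run auto_eval every REFRESH_RATE seconds.
    scheduler.add_job(auto_eval, "interval", seconds=REFRESH_RATE)
    scheduler.start()
    try:
        while True:  # keep the main thread alive; in the Space, demo.launch() plays this role
            time.sleep(1)
    except (KeyboardInterrupt, SystemExit):
        scheduler.shutdown()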
custom_tasks.py CHANGED
@@ -6,6 +6,7 @@ This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then im
 
 Author:
 """
+
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
 from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
main_backend_harness.py CHANGED
@@ -3,25 +3,56 @@ import pprint
 
 from huggingface_hub import snapshot_download
 
-
-
+from src.backend.manage_requests import (
+    FAILED_STATUS,
+    FINISHED_STATUS,
+    PENDING_STATUS,
+    RUNNING_STATUS,
+    check_completed_evals,
+    get_eval_requests,
+    set_eval_request,
+)
 from src.backend.run_eval_suite_harness import run_evaluation
-from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request, PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS
 from src.backend.sort_queue import sort_models_by_priority
-
-
-
+from src.envs import (
+    API,
+    DEVICE,
+    EVAL_REQUESTS_PATH_BACKEND,
+    EVAL_RESULTS_PATH_BACKEND,
+    LIMIT,
+    NUM_FEWSHOT,
+    QUEUE_REPO,
+    RESULTS_REPO,
+    TASKS_HARNESS,
+    TOKEN,
+)
 from src.logging import setup_logger
 
 
+logging.getLogger("openai").setLevel(logging.WARNING)
 
 # logging.basicConfig(level=logging.ERROR)
 logger = setup_logger(__name__)
 pp = pprint.PrettyPrinter(width=80)
 
 
-snapshot_download(
-
+snapshot_download(
+    repo_id=RESULTS_REPO,
+    revision="main",
+    local_dir=EVAL_RESULTS_PATH_BACKEND,
+    repo_type="dataset",
+    max_workers=60,
+    token=TOKEN,
+)
+snapshot_download(
+    repo_id=QUEUE_REPO,
+    revision="main",
+    local_dir=EVAL_REQUESTS_PATH_BACKEND,
+    repo_type="dataset",
+    max_workers=60,
+    token=TOKEN,
+)
+
 
 def run_auto_eval():
     current_pending_status = [PENDING_STATUS]
@@ -36,11 +67,13 @@ def run_auto_eval():
         hf_repo=QUEUE_REPO,
         local_dir=EVAL_REQUESTS_PATH_BACKEND,
         hf_repo_results=RESULTS_REPO,
-        local_dir_results=EVAL_RESULTS_PATH_BACKEND
+        local_dir_results=EVAL_RESULTS_PATH_BACKEND,
     )
 
     # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(
+    eval_requests = get_eval_requests(
+        job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
+    )
     # Sort the evals by priority (first submitted first run)
     eval_requests = sort_models_by_priority(api=API, models=eval_requests)
 
@@ -61,16 +94,16 @@ def run_auto_eval():
     )
 
     run_evaluation(
-        eval_request=eval_request,
-        task_names=TASKS_HARNESS,
-        num_fewshot=NUM_FEWSHOT,
+        eval_request=eval_request,
+        task_names=TASKS_HARNESS,
+        num_fewshot=NUM_FEWSHOT,
         local_dir=EVAL_RESULTS_PATH_BACKEND,
         results_repo=RESULTS_REPO,
         batch_size="auto",
-        device=DEVICE,
-        limit=LIMIT
-
+        device=DEVICE,
+        limit=LIMIT,
+    )
 
 
 if __name__ == "__main__":
-    run_auto_eval()
+    run_auto_eval()
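Both backends now spell out their module-level snapshot_download calls one argument per line; the behaviour is unchanged: they mirror the requests and results datasets into local folders before any evaluation starts. A standalone sketch of what one such call does, with a hypothetical repo id and local path rather than this Space's real values:

import os

from huggingface_hub import snapshot_download

# Mirror a Hub dataset repo into a local folder (repo id and path are placeholders).
snapshot_download(
    repo_id="my-org/requests",
    revision="main",
    local_dir="./eval-queue-bk",
    repo_type="dataset",
    max_workers=60,  # parallel download workers
    token=os.environ.get("HF_TOKEN"),
)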
main_backend_lighteval.py CHANGED
@@ -3,22 +3,57 @@ import pprint
 
 from huggingface_hub import snapshot_download
 
-
-
+from src.backend.manage_requests import (
+    FAILED_STATUS,
+    FINISHED_STATUS,
+    PENDING_STATUS,
+    RUNNING_STATUS,
+    check_completed_evals,
+    get_eval_requests,
+    set_eval_request,
+)
 from src.backend.run_eval_suite_lighteval import run_evaluation
-from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request, PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS
 from src.backend.sort_queue import sort_models_by_priority
-
-
+from src.envs import (
+    ACCELERATOR,
+    API,
+    EVAL_REQUESTS_PATH_BACKEND,
+    EVAL_RESULTS_PATH_BACKEND,
+    LIMIT,
+    QUEUE_REPO,
+    REGION,
+    RESULTS_REPO,
+    TASKS_LIGHTEVAL,
+    TOKEN,
+    VENDOR,
+)
 from src.logging import setup_logger
 
+
+logging.getLogger("openai").setLevel(logging.WARNING)
+
 logger = setup_logger(__name__)
 
 # logging.basicConfig(level=logging.ERROR)
 pp = pprint.PrettyPrinter(width=80)
 
-snapshot_download(
-
+snapshot_download(
+    repo_id=RESULTS_REPO,
+    revision="main",
+    local_dir=EVAL_RESULTS_PATH_BACKEND,
+    repo_type="dataset",
+    max_workers=60,
+    token=TOKEN,
+)
+snapshot_download(
+    repo_id=QUEUE_REPO,
+    revision="main",
+    local_dir=EVAL_REQUESTS_PATH_BACKEND,
+    repo_type="dataset",
+    max_workers=60,
+    token=TOKEN,
+)
+
 
 def run_auto_eval():
     current_pending_status = [PENDING_STATUS]
@@ -33,11 +68,13 @@ def run_auto_eval():
         hf_repo=QUEUE_REPO,
         local_dir=EVAL_REQUESTS_PATH_BACKEND,
         hf_repo_results=RESULTS_REPO,
-        local_dir_results=EVAL_RESULTS_PATH_BACKEND
+        local_dir_results=EVAL_RESULTS_PATH_BACKEND,
    )
 
     # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(
+    eval_requests = get_eval_requests(
+        job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
+    )
     # Sort the evals by priority (first submitted first run)
     eval_requests = sort_models_by_priority(api=API, models=eval_requests)
 
@@ -49,7 +86,6 @@ def run_auto_eval():
     eval_request = eval_requests[0]
     logger.info(pp.pformat(eval_request))
 
-
     set_eval_request(
         api=API,
         eval_request=eval_request,
@@ -59,29 +95,33 @@ def run_auto_eval():
     )
 
     # This needs to be done
-    #instance_size, instance_type = get_instance_for_model(eval_request)
+    # instance_size, instance_type = get_instance_for_model(eval_request)
     # For GPU
-    # instance_size, instance_type = "small", "g4dn.xlarge"
+    # instance_size, instance_type = "small", "g4dn.xlarge"
     # For CPU
     # Updated naming available at https://huggingface.co/docs/inference-endpoints/pricing
     instance_size, instance_type = "x4", "intel-icl"
-    logger.info(
+    logger.info(
+        f"Starting Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}"
+    )
 
     run_evaluation(
-        eval_request=eval_request,
-        task_names=TASKS_LIGHTEVAL,
+        eval_request=eval_request,
+        task_names=TASKS_LIGHTEVAL,
         local_dir=EVAL_RESULTS_PATH_BACKEND,
-        batch_size=1,
-        accelerator=ACCELERATOR,
-        region=REGION,
-        vendor=VENDOR,
-        instance_size=instance_size,
-        instance_type=instance_type,
-        limit=LIMIT
-
+        batch_size=1,
+        accelerator=ACCELERATOR,
+        region=REGION,
+        vendor=VENDOR,
+        instance_size=instance_size,
+        instance_type=instance_type,
+        limit=LIMIT,
+    )
 
-    logger.info(
+    logger.info(
+        f"Completed Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}"
+    )
 
 
 if __name__ == "__main__":
-    run_auto_eval()
+    run_auto_eval()
pyproject.toml CHANGED
@@ -1,13 +1,10 @@
 [tool.ruff]
-# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
-select = ["E", "F"]
-ignore = ["E501"] # line too long (black is taking care of this)
 line-length = 119
-fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
 
-[tool.
-
-
+[tool.ruff.lint]
+select = ["C", "E", "F", "I", "W"]
+ignore = ["E501"] # line too long (the formatter is taking care of this)
 
-[tool.
-
+[tool.ruff.lint.isort]
+lines-after-imports = 2
+known-local-folder = ["src"]
requirements.txt CHANGED
@@ -1,5 +1,4 @@
 APScheduler==3.10.1
-black==23.11.0
 click==8.1.3
 huggingface-hub>=0.18.0
 python-dateutil==2.8.2
scripts/create_request_file.py CHANGED
@@ -7,7 +7,9 @@ from datetime import datetime, timezone
 import click
 from colorama import Fore
 from huggingface_hub import HfApi, snapshot_download
-
+
+from src.envs import EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN
+
 
 precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ", "float32")
 model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
@@ -34,7 +36,9 @@ def get_model_size(model_info, precision: str):
 def main():
     api = HfApi()
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-    snapshot_download(
+    snapshot_download(
+        repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN
+    )
 
     model_name = click.prompt("Enter model name")
     revision = click.prompt("Enter revision", default="main")
scripts/fix_harness_import.py CHANGED
@@ -2,10 +2,12 @@
 It creates a folder not ported during harness package creation (as they don't use a Manifest file atm and it ignore `.json` files).
 It will need to be updated if we want to use the harness' version of big bench to actually copy the json files.
 """
+
 import os
 
 import lm_eval
 
+
 if __name__ == "__main__":
     lm_eval_path = lm_eval.__path__[0]
-    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
+    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
src/backend/manage_requests.py CHANGED
@@ -4,9 +4,11 @@ from dataclasses import dataclass
 from typing import Optional
 
 from huggingface_hub import HfApi, snapshot_download
+
 from src.envs import TOKEN
 from src.logging import setup_logger
 
+
 logger = setup_logger(__name__)
 
 PENDING_STATUS = "PENDING"
@@ -14,27 +16,30 @@ RUNNING_STATUS = "RUNNING"
 FINISHED_STATUS = "FINISHED"
 FAILED_STATUS = "FAILED"
 
+
 @dataclass
 class EvalRequest:
-    """This class represents one evaluation request file.
-
+    """This class represents one evaluation request file."""
+
     model: str
     status: str
     json_filepath: str
     weight_type: str = "Original"
     model_type: str = "" # pretrained, finetuned, with RL
     precision: str = "" # float16, bfloat16
-    revision: str = "main"
-    submitted_time: Optional[str] =
-
+    revision: str = "main"  # commit hash
+    submitted_time: Optional[str] = (
+        "2022-05-18T11:40:22.519222"  # random date just so that we can still order requests by date
+    )
+    model_type: Optional[str] = None  # pretrained, fine-tuned, etc - define your own categories in
     likes: Optional[int] = 0
     params: Optional[int] = None
     license: Optional[str] = ""
     base_model: Optional[str] = ""
     private: Optional[bool] = False
-
+
     def get_model_args(self):
-        """Edit this function if you want to manage more complex quantization issues. You'll need to map it to
+        """Edit this function if you want to manage more complex quantization issues. You'll need to map it to
         the evaluation suite you chose.
         """
         model_args = f"pretrained={self.model},revision={self.revision}"
@@ -45,7 +50,7 @@ class EvalRequest:
         # Quantized models need some added config, the install of bits and bytes, etc
         else:
             raise Exception(f"Unknown precision {self.precision}.")
-
+
         return model_args
 
 
@@ -77,7 +82,9 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
     Returns:
         `list[EvalRequest]`: a list of model info dicts.
     """
-    snapshot_download(
+    snapshot_download(
+        repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60, token=TOKEN
+    )
     json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
 
     eval_requests = []
@@ -102,6 +109,7 @@ def eval_was_running(eval_request: EvalRequest):
     status = data["status"]
     return status == RUNNING_STATUS
 
+
 def check_completed_evals(
     api: HfApi,
     hf_repo: str,
@@ -114,12 +122,12 @@ def check_completed_evals(
 ):
     """Checks if the currently running evals are completed, if yes, update their status on the hub."""
     snapshot_download(
-        repo_id=hf_repo_results,
-        revision="main",
-        local_dir=local_dir_results,
-        repo_type="dataset",
-        max_workers=60,
-        token=TOKEN
+        repo_id=hf_repo_results,
+        revision="main",
+        local_dir=local_dir_results,
+        repo_type="dataset",
+        max_workers=60,
+        token=TOKEN,
     )
 
     running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
@@ -134,13 +142,9 @@ def check_completed_evals(
         output_file_exists = len(glob.glob(output_file)) > 0
 
         if output_file_exists:
-            logger.info(
-                f"EXISTS output file exists for {model} setting it to {completed_status}"
-            )
+            logger.info(f"EXISTS output file exists for {model} setting it to {completed_status}")
             set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
         else:
             if eval_was_running(eval_request=eval_request):
-                logger.info(
-                    f"No result file found for {model} setting it to {failed_status}"
-                )
+                logger.info(f"No result file found for {model} setting it to {failed_status}")
                 set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
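get_eval_requests now wraps its snapshot_download call and then turns every *.json file it finds into an EvalRequest. A sketch of that mapping with a stand-in dataclass; the JSON payload and file path below are made up, and the field names follow the diff:

import json
from dataclasses import dataclass
from typing import Optional


@dataclass
class EvalRequest:  # stand-in carrying only a subset of the real fields
    model: str
    status: str
    json_filepath: str
    precision: str = ""
    revision: str = "main"
    submitted_time: Optional[str] = None


raw = '{"model": "org/some-model", "status": "PENDING", "precision": "float16", "revision": "main"}'
request = EvalRequest(json_filepath="eval-queue-bk/org/some-model.json", **json.loads(raw))

# The harness backend then builds its model arguments from these fields,
# as get_model_args does:
model_args = f"pretrained={request.model},revision={request.revision}"
print(model_args)  # pretrained=org/some-model,revision=main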
src/backend/run_eval_suite_harness.py CHANGED
@@ -1,21 +1,32 @@
 import json
-import os
 import logging
+import os
 from datetime import datetime
+from typing import Union
 
-from lm_eval import
+from lm_eval import evaluator, utils
 from lm_eval.tasks import TaskManager
 
-from src.envs import RESULTS_REPO, API
 from src.backend.manage_requests import EvalRequest
+from src.envs import API
 from src.logging import setup_logger
 
-from typing import Union
 
 logging.getLogger("openai").setLevel(logging.WARNING)
 logger = setup_logger(__name__)
 
-
+
+def run_evaluation(
+    eval_request: EvalRequest,
+    task_names: list,
+    num_fewshot: int,
+    batch_size: Union[int, str],
+    device: str,
+    local_dir: str,
+    results_repo: str,
+    no_cache: bool = True,
+    limit: int = None,
+):
     """Runs one evaluation for the current evaluation request file, then pushes the results to the hub.
 
     Args:
@@ -51,7 +62,7 @@ def run_evaluation(eval_request: EvalRequest, task_names: list, num_fewshot: int
         batch_size=batch_size,
         device=device,
         limit=limit,
-        write_out=True
+        write_out=True,  # Whether to write out an example document and model input, for checking task integrity
     )
 
     results["config"]["model_dtype"] = eval_request.precision
src/backend/run_eval_suite_lighteval.py CHANGED
@@ -1,23 +1,31 @@
 import json
-import argparse
 import logging
-from datetime import datetime
 
-import lighteval
 from lighteval.logging.evaluation_tracker import EvaluationTracker
 from lighteval.models.model_config import InferenceEndpointModelConfig
 from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
 
-from lighteval.main_accelerate import main, EnvConfig, create_model_config
-
-from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN
 from src.backend.manage_requests import EvalRequest
+from src.envs import RESULTS_REPO
 from src.logging import setup_logger
 
+
 logging.getLogger("openai").setLevel(logging.WARNING)
 logger = setup_logger(__name__)
 
-
+
+def run_evaluation(
+    eval_request: EvalRequest,
+    task_names: str,
+    batch_size: int,
+    local_dir: str,
+    accelerator: str,
+    region: str,
+    vendor: str,
+    instance_size: str,
+    instance_type: str,
+    limit=None,
+):
     """Runs one evaluation for the current evaluation request file using lighteval, then pushes the results to the hub.
 
     Args:
@@ -32,18 +40,20 @@ def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int,
         local_dir (str): Where to save the results locally
         no_cache (bool, optional): Whether to use a cache or not.
         limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
-    """
+    """
 
     if limit:
-        logger.info(
+        logger.info(
+            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
+        )
 
     evaluation_tracker = EvaluationTracker(
         output_dir="./results",
-        save_details
-        push_to_hub
-        push_to_tensorboard
-        hub_results_org=
-        public
+        save_details=True,
+        push_to_hub=True,
+        push_to_tensorboard=False,
+        hub_results_org=RESULTS_REPO,
+        public=False,
     )
 
     pipeline_params = PipelineParameters(
@@ -52,21 +62,21 @@ def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int,
         max_samples=limit,
         use_chat_template=False,
         system_prompt=None,
-        custom_tasks_directory="custom_tasks.py",
+        custom_tasks_directory="custom_tasks.py",  # if using a custom task
     )
 
     model_config = InferenceEndpointModelConfig(
         # Endpoint parameters
-        name
-        repository
-        accelerator
-        vendor=
-        region=
-        instance_size=
-        instance_type=
-        should_reuse_existing=
-        model_dtype=
-        revision=
+        name=eval_request.model.replace(".", "-").lower(),
+        repository=eval_request.model,
+        accelerator=accelerator,
+        vendor=vendor,
+        region=region,
+        instance_size=instance_size,
+        instance_type=instance_type,
+        should_reuse_existing=False,
+        model_dtype=eval_request.precision,
+        revision=eval_request.revision,
     )
 
     pipeline = Pipeline(
@@ -85,7 +95,7 @@ def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int,
         dumped = json.dumps(results, indent=2)
         logger.info(dumped)
 
-    except Exception
+    except Exception:  # if eval failed, we force a cleanup
         pipeline.model.cleanup()
 
     return results
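The now fully spelled-out InferenceEndpointModelConfig derives the endpoint name from the model id by replacing dots and lowercasing. A tiny sketch of that transformation, using an example model id:

# Example of the endpoint-name derivation used above (model id is illustrative).
model_id = "org/Some-Model-v0.1"
endpoint_name = model_id.replace(".", "-").lower()
print(endpoint_name)  # org/some-model-v0-1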
src/backend/sort_queue.py CHANGED
@@ -1,4 +1,3 @@
-import re
 from dataclasses import dataclass
 
 from huggingface_hub import HfApi
@@ -11,6 +10,7 @@ class ModelMetadata:
     likes: int = 0
     size: int = 15
 
+
 # All the functions below sort the models in the queue based on different parameters
 def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
     private_models = [model for model in models if model.private]
@@ -18,11 +18,14 @@ def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
 
     return sort_by_submit_date(private_models) + sort_by_submit_date(public_models)
 
+
 def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
     return sorted(eval_requests, key=lambda x: x.submitted_time, reverse=False)
 
+
 def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
     return sorted(eval_requests, key=lambda x: x.params, reverse=False)
 
+
 def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
-    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
+    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
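The sorting helpers are unchanged apart from spacing: private requests are evaluated first, and each group is ordered by submission date, oldest first. A small self-contained sketch of that ordering, with a stub dataclass and made-up dates:

from dataclasses import dataclass


@dataclass
class Req:  # stub standing in for EvalRequest
    model: str
    submitted_time: str
    private: bool = False


def sort_by_submit_date(reqs):
    return sorted(reqs, key=lambda x: x.submitted_time)


def sort_models_by_priority(reqs):
    # Private requests first, then public, each group oldest-first.
    private = [r for r in reqs if r.private]
    public = [r for r in reqs if not r.private]
    return sort_by_submit_date(private) + sort_by_submit_date(public)


queue = [
    Req("org/b", "2024-02-01T00:00:00Z"),
    Req("org/a", "2024-01-01T00:00:00Z"),
    Req("org/c", "2024-01-15T00:00:00Z", private=True),
]
print([r.model for r in sort_models_by_priority(queue)])  # ['org/c', 'org/a', 'org/b']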
src/display/log_visualizer.py CHANGED
@@ -1,5 +1,4 @@
 from io import StringIO
-from pathlib import Path
 
 from bs4 import BeautifulSoup
 from rich.console import Console
@@ -12,8 +11,8 @@ from src.logging import log_file
 
 def log_file_to_html_string(reverse=True):
     with open(log_file, "rt") as f:
-
-
+        lines = f.readlines()
+        lines = lines[-NUM_LINES_VISUALIZE:]
 
     if reverse:
         lines = reversed(lines)
@@ -26,12 +25,12 @@ def log_file_to_html_string(reverse=True):
     html_content = console.export_html(inline_styles=True)
 
     # Parse the HTML content using BeautifulSoup
-    soup = BeautifulSoup(html_content,
+    soup = BeautifulSoup(html_content, "lxml")
 
     # Modify the <pre> tag and add custom styles
     pre_tag = soup.pre
-    pre_tag[
-    del pre_tag[
+    pre_tag["class"] = "scrollable"
+    del pre_tag["style"]
 
     # Add your custom styles and the .scrollable CSS to the <style> tag
     style_tag = soup.style
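log_file_to_html_string renders the tail of the log with rich, exports it as HTML, and retags the pre element so the page CSS can scroll it. A hedged sketch of that render-and-retag flow; the sample lines are made up and the snippet assumes rich, beautifulsoup4 and lxml are installed:

from io import StringIO

from bs4 import BeautifulSoup
from rich.console import Console

lines = ["INFO first line", "INFO second line"]  # made-up stand-in for the log tail

console = Console(record=True, file=StringIO())  # record=True allows export_html() later
for line in lines:
    console.print(line)

html_content = console.export_html(inline_styles=True)

soup = BeautifulSoup(html_content, "lxml")
pre_tag = soup.pre
pre_tag["class"] = "scrollable"    # let the page CSS make the log area scrollable
pre_tag.attrs.pop("style", None)   # drop rich's inline style, as the real code does
print(soup.prettify()[:200])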
src/envs.py CHANGED
@@ -2,23 +2,24 @@ import os
 
 from huggingface_hub import HfApi
 
+
 # Info to change for your repository
 # ----------------------------------
-TOKEN = os.environ.get("HF_TOKEN")
+TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org
 
-OWNER = "demo-leaderboard-backend"
+OWNER = "demo-leaderboard-backend"  # Change to your org - don't forget to create a results and request dataset
 
 # For harness evaluations
-DEVICE = "cpu"
-LIMIT = 20
-NUM_FEWSHOT = 0
+DEVICE = "cpu"  # "cuda:0" if you add compute, for harness evaluations
+LIMIT = 20  # !!!! For testing, should be None for actual evaluations!!!
+NUM_FEWSHOT = 0  # Change with your few shot for the Harness evaluations
 TASKS_HARNESS = ["anli_r1", "logiqa"]
 
 # For lighteval evaluations
 ACCELERATOR = "cpu"
 REGION = "us-east-1"
 VENDOR = "aws"
-TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
+TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
 # To add your own tasks, edit the custom file and launch it with `custom|myothertask|0|0``
 
 # ---------------------------------------------------
@@ -27,7 +28,7 @@ QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
 # If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
@@ -39,4 +40,3 @@ REFRESH_RATE = 10 * 60 # 10 min
 NUM_LINES_VISUALIZE = 300
 
 API = HfApi(token=TOKEN)
-
src/logging.py CHANGED
@@ -1,19 +1,17 @@
-import
+import logging
 from pathlib import Path
 
-proj_dir = Path(__file__).parents[1]
-
-log_file = proj_dir/"output.log"
 
+proj_dir = Path(__file__).parents[1]
 
-
+log_file = proj_dir / "output.log"
 
 
 def setup_logger(name: str):
     logger = logging.getLogger(name)
     logger.setLevel(logging.INFO)
 
-    formatter = logging.Formatter(
+    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 
     # Create a file handler to write logs to a file
     file_handler = logging.FileHandler(log_file)
@@ -29,10 +27,10 @@ def configure_root_logger():
     logging.basicConfig(level=logging.INFO)
     root_logger = logging.getLogger()
 
-    formatter = logging.Formatter(
+    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 
     file_handler = logging.FileHandler(log_file)
     file_handler.setLevel(logging.INFO)
     file_handler.setFormatter(formatter)
 
-    root_logger.addHandler(file_handler)
+    root_logger.addHandler(file_handler)