import evaluate
import lm_eval
from typing import Union, List, Optional
from dmx.compressor.dmx import config_rules, DmxModel
import datasets
import torch

_DESCRIPTION = """
Evaluation module using lm-eval with d-Matrix integration.
It evaluates language models across lm-eval tasks, with the option to apply
d-Matrix model compression before evaluation.
"""

_KWARGS_DESCRIPTION = """
Args:
    model (str): The name or path of the model to evaluate.
    tasks (Union[str, List[str]]): The task or list of tasks to evaluate on.
    dmx_config (Optional[str]): Name of the d-Matrix transformation rules to apply
        (an attribute of `config_rules`), defaults to None.
    num_fewshot (Optional[int]): Number of examples in the few-shot context, defaults to None.
    batch_size (Optional[Union[int, str]]): Batch size for the model (an int or "auto"), defaults to None.
    max_batch_size (Optional[int]): Maximum batch size to try with automatic batch size detection, defaults to None.
    limit (Optional[Union[int, float]]): Limit on the number of examples per task, defaults to None.
    device (Optional[str]): Device to run on. If None, defaults to "cuda" if available, otherwise "cpu".
    revision (str): Model revision to use, defaults to "main".
    trust_remote_code (bool): Whether to trust remote code, defaults to False.
    log_samples (bool): If True, logs all model outputs and documents, defaults to True.
    verbosity (str): Logging verbosity level, defaults to "INFO".
    references (Optional[List[str]]): Unused; injected by `evaluate` to satisfy the declared input feature.
    **kwargs: Additional keyword arguments passed through to `lm_eval.evaluate`.

Returns:
    dict: A dictionary containing the evaluation results.
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class DmxMetric(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation="",
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "references": datasets.Value("string"),
                }
            ),
            reference_urls=["https://github.com/EleutherAI/lm-evaluation-harness"],
        )

    def _compute(
        self,
        model: str,
        tasks: Union[str, List[str]],
        dmx_config: Optional[str] = None,
        num_fewshot: Optional[int] = None,
        batch_size: Optional[Union[int, str]] = None,
        max_batch_size: Optional[int] = None,
        limit: Optional[Union[int, float]] = None,
        device: Optional[str] = None,
        revision: str = "main",
        trust_remote_code: bool = False,
        log_samples: bool = True,
        verbosity: str = "INFO",
        # `references` is passed in by evaluate's compute() because it is declared
        # as an input feature; it is not used by lm-eval and is only absorbed here
        # so it does not leak into **kwargs.
        references: Optional[List[str]] = None,
        **kwargs,
    ):
        """
        Evaluate a model on multiple tasks and metrics using lm-eval,
        with optional d-Matrix integration.
        """
        # Pick a device automatically when none is given.
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"

        # Build the lm-eval Hugging Face LM wrapper from an argument string.
        model_args = (
            f"pretrained={model},revision={revision},"
            f"trust_remote_code={str(trust_remote_code)},device={device}"
        )
        lm = lm_eval.api.registry.get_model("hf").create_from_arg_string(
            model_args,
            {
                "batch_size": batch_size,
                "max_batch_size": max_batch_size,
            },
        )

        # Optionally wrap the underlying torch model with DmxModel and apply
        # the named d-Matrix configuration rules.
        if dmx_config:
            lm._model = DmxModel.from_torch(lm._model)
            lm._model.transform(
                lm._model.dmx_config, *eval(f"config_rules.{dmx_config}")
            )

        # Resolve task names into lm-eval task objects and set the few-shot count.
        task_dict = lm_eval.tasks.get_task_dict(
            tasks if isinstance(tasks, list) else [tasks]
        )
        if num_fewshot is not None:
            for task in task_dict.values():
                task.set_config(key="num_fewshot", value=num_fewshot)

        eval_params = {
            "lm": lm,
            "task_dict": task_dict,
            "limit": limit,
            "log_samples": log_samples,
            "verbosity": verbosity,
            **kwargs,
        }

        results = lm_eval.evaluate(**eval_params)
        return results.get("results", {})
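

# ---------------------------------------------------------------------------
# Usage sketch (assumptions, not part of the module API): the file is saved
# locally as "dmx_metric.py"; "distilgpt2" and the "wikitext" task are
# placeholder choices; "BASELINE" stands in for whichever attribute of
# `config_rules` you actually want to apply.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    metric = evaluate.load("dmx_metric.py", module_type="metric")
    results = metric.compute(
        model="distilgpt2",      # any Hugging Face causal LM checkpoint
        tasks="wikitext",        # a single task name or a list of names
        dmx_config="BASELINE",   # omit to evaluate the unmodified model
        limit=10,                # evaluate only a few examples for a quick check
        batch_size=1,
        references=[""],         # dummy value for the module's declared input feature
    )
    print(results)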