from typing import List, Optional, Union

import datasets
import evaluate
import lm_eval
import torch
from dmx.compressor.dmx import DmxModel, config_rules
_DESCRIPTION = """
Evaluation function using lm-eval with d-Matrix integration.
This function allows for the evaluation of language models across various tasks,
with the option to use d-Matrix compressed models.
"""
_KWARGS_DESCRIPTION = """
Args:
model (str): The name or path of the model to evaluate.
tasks (Union[str, List[str]]): The task or list of tasks to evaluate on.
dmx_config (Optional[str]): Configuration string for d-Matrix transformations, defaults to None.
num_fewshot (Optional[int]): Number of examples in few-shot context, defaults to None.
batch_size (Optional[Union[int, str]]): Batch size for model, defaults to None.
max_batch_size (Optional[int]): Maximum batch size to try with automatic batch size detection, defaults to None.
limit (Optional[Union[int, float]]): Limit the number of examples per task, defaults to None.
device (Optional[str]): Device to run on. If None, defaults to 'cuda' if available, otherwise 'cpu'.
revision (str): Model revision to use, defaults to 'main'.
trust_remote_code (bool): Whether to trust remote code, defaults to False.
log_samples (bool): If True, logs all model outputs and documents, defaults to True.
verbosity (str): Logging verbosity level, defaults to 'INFO'.
**kwargs: Additional keyword arguments to pass to `lm_eval.evaluate`.
Returns:
dict: A dictionary containing the evaluation results.
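Example:
    A minimal sketch, assuming the module is loaded from a Hub repo (the path below
    is hypothetical) and that a rule set named `BASELINE` exists in `config_rules`;
    the empty `references` value only satisfies `evaluate`'s input plumbing and is
    not used by the evaluation itself:

    >>> import evaluate
    >>> dmx_metric = evaluate.load("d-matrix/dmx_metric")
    >>> results = dmx_metric.compute(
    ...     model="distilgpt2",
    ...     tasks=["wikitext"],
    ...     dmx_config="BASELINE",
    ...     references=[""],
    ...     limit=10,
    ... )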
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class DmxMetric(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation="",
            inputs_description=_KWARGS_DESCRIPTION,
            # The `references` feature is a placeholder required by `evaluate`;
            # the data actually evaluated comes from the lm-eval task datasets.
            features=datasets.Features(
                {
                    "references": datasets.Value("string"),
                }
            ),
            reference_urls=["https://github.com/EleutherAI/lm-evaluation-harness"],
        )
    def _compute(
        self,
        model: str,
        tasks: Union[str, List[str]],
        dmx_config: Optional[str] = None,
        num_fewshot: Optional[int] = None,
        batch_size: Optional[Union[int, str]] = None,
        max_batch_size: Optional[int] = None,
        limit: Optional[Union[int, float]] = None,
        device: Optional[str] = None,
        revision: str = "main",
        trust_remote_code: bool = False,
        log_samples: bool = True,
        verbosity: str = "INFO",
        **kwargs,
    ):
"""
Evaluate a model on multiple tasks and metrics using lm-eval with optional d-Matrix integration.
"""
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        model_args = f"pretrained={model},revision={revision},trust_remote_code={str(trust_remote_code)},device={device}"
        lm = lm_eval.api.registry.get_model("hf").create_from_arg_string(
            model_args,
            {
                "batch_size": batch_size,
                "max_batch_size": max_batch_size,
            },
        )
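        # Optionally wrap the underlying torch model with DmxModel and apply the
        # named d-Matrix configuration rule set (an attribute of `config_rules`).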
        if dmx_config:
            lm._model = DmxModel.from_torch(lm._model)
            lm._model.transform(lm._model.dmx_config, *eval(f"config_rules.{dmx_config}"))
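        # Build the lm-eval task dictionary and propagate the few-shot setting to every task.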
        task_dict = lm_eval.tasks.get_task_dict(tasks if isinstance(tasks, list) else [tasks])
        for task in task_dict.values():
            if num_fewshot is not None:
                task.set_config(key="num_fewshot", value=num_fewshot)
        # `evaluate.EvaluationModule.compute()` forwards the placeholder `references`
        # feature to this method; drop it so it is not passed on to lm_eval.evaluate().
        kwargs.pop("references", None)
        eval_params = {
            "lm": lm,
            "task_dict": task_dict,
            "limit": limit,
            "log_samples": log_samples,
            "verbosity": verbosity,
            **kwargs,
        }
        results = lm_eval.evaluate(**eval_params)
        # Return only the per-task results from the full lm-eval output.
        return results.get("results", {})