from typing import List, Optional, Union

import datasets
import evaluate
import lm_eval
import torch

from dmx.compressor.dmx import DmxModel, config_rules

_DESCRIPTION = """
Evaluation function using lm-eval with d-Matrix integration.
This function allows for the evaluation of language models across various tasks, 
with the option to use d-Matrix compressed models.
"""

_KWARGS_DESCRIPTION = """
Args:
    model (str): The name or path of the model to evaluate.
    tasks (Union[str, List[str]]): The task or list of tasks to evaluate on.
    dmx_config (Optional[str]): Configuration string for d-Matrix transformations, defaults to None.
    num_fewshot (Optional[int]): Number of examples in few-shot context, defaults to None.
    batch_size (Optional[Union[int, str]]): Batch size for model, defaults to None.
    max_batch_size (Optional[int]): Maximum batch size to try with automatic batch size detection, defaults to None.
    limit (Optional[Union[int, float]]): Limit the number of examples per task, defaults to None.
    device (Optional[str]): Device to run on. If None, defaults to 'cuda' if available, otherwise 'cpu'.
    revision (str): Model revision to use, defaults to 'main'.
    trust_remote_code (bool): Whether to trust remote code, defaults to False.
    log_samples (bool): If True, logs all model outputs and documents, defaults to True.
    verbosity (str): Logging verbosity level, defaults to 'INFO'.
    **kwargs: Additional keyword arguments to pass to `lm_eval.evaluate`.

Returns:
    dict: A dictionary containing the evaluation results.
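
Examples:
    An illustrative sketch only: the load path, model checkpoint, task name, and
    dmx_config value below are placeholders rather than values defined by this
    module. `references` is required by the input schema declared in `_info` but
    is not used by the lm-eval run itself.

    >>> import evaluate
    >>> dmx_metric = evaluate.load("path/to/this/module")  # or the published hub id
    >>> results = dmx_metric.compute(
    ...     references=[""],
    ...     model="distilgpt2",
    ...     tasks="lambada_openai",
    ...     dmx_config="BASELINE",
    ...     limit=10,
    ... )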
"""

@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class DmxMetric(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation="",
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "references": datasets.Value("string"),
                }
            ),
            reference_urls=["https://github.com/EleutherAI/lm-evaluation-harness"],
        )

    def _compute(
        self,
        model: str,
        tasks: Union[str, List[str]],
        dmx_config: Optional[str] = None,
        num_fewshot: Optional[int] = None,
        batch_size: Optional[Union[int, str]] = None,
        max_batch_size: Optional[int] = None,
        limit: Optional[Union[int, float]] = None,
        device: Optional[str] = None,
        revision: str = "main",
        trust_remote_code: bool = False, 
        log_samples: bool = True,
        verbosity: str = "INFO",
        **kwargs
    ):
        """
        Evaluate a model on multiple tasks and metrics using lm-eval with optional d-Matrix integration.
        """
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        
        # Argument string consumed by lm-eval's Hugging Face ("hf") model loader.
        model_args = f"pretrained={model},revision={revision},trust_remote_code={str(trust_remote_code)},device={device}"

        # Instantiate lm-eval's Hugging Face model wrapper from the argument string.
        lm = lm_eval.api.registry.get_model("hf").create_from_arg_string(
            model_args,
            {
                "batch_size": batch_size,
                "max_batch_size": max_batch_size,
            },
        )
        
        if dmx_config:
            # Wrap the underlying torch model and apply the d-Matrix transformation
            # rules selected by name from `config_rules`.
            lm._model = DmxModel.from_torch(lm._model)
            lm._model.transform(lm._model.dmx_config, *eval(f"config_rules.{dmx_config}"))
        
        # Resolve the requested task(s); accept a single task name or a list of names.
        task_dict = lm_eval.tasks.get_task_dict(tasks if isinstance(tasks, list) else [tasks])

        # Override the few-shot count on every task if one was requested.
        if num_fewshot is not None:
            for task in task_dict.values():
                task.set_config(key="num_fewshot", value=num_fewshot)
        
        # `compute()` forwards the declared `references` feature into **kwargs;
        # drop it here so it is not passed through to `lm_eval.evaluate`, which
        # does not accept it.
        kwargs.pop("references", None)

        eval_params = {
            'lm': lm,
            'task_dict': task_dict,
            'limit': limit,
            'log_samples': log_samples,
            'verbosity': verbosity,
            **kwargs
        }
        
        results = lm_eval.evaluate(**eval_params)
        return results.get('results', {})
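

# Minimal local smoke test (illustrative only): the checkpoint, task, and
# dmx_config names below are placeholders, not values defined by this module.
if __name__ == "__main__":
    metric = DmxMetric()
    scores = metric.compute(
        references=[""],         # placeholder required by the evaluate input schema
        model="distilgpt2",      # any Hugging Face causal LM checkpoint
        tasks="lambada_openai",  # a single lm-eval task name or a list of names
        dmx_config="BASELINE",   # assumed rule-set name in dmx.compressor's config_rules
        batch_size=8,
        limit=10,                # run only a handful of examples for a quick check
    )
    print(scores)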