dmxMetric / dmxMetric.py
import evaluate
import lm_eval
from typing import Union, List, Optional
from dmx.compressor.dmx import config_rules, DmxModel
import datasets
import torch

_DESCRIPTION = """
Evaluation function using lm-eval with d-Matrix integration.
This function allows for the evaluation of language models across various tasks,
with the option to use d-Matrix compressed models.
"""

_KWARGS_DESCRIPTION = """
Args:
    model (str): The name or path of the model to evaluate.
    tasks (Union[str, List[str]]): The task or list of tasks to evaluate on.
    dmx_config (Optional[str]): Configuration string for d-Matrix transformations, defaults to None.
    num_fewshot (Optional[int]): Number of examples in the few-shot context, defaults to None.
    batch_size (Optional[Union[int, str]]): Batch size for the model, defaults to None.
    max_batch_size (Optional[int]): Maximum batch size to try with automatic batch size detection, defaults to None.
    limit (Optional[Union[int, float]]): Limit on the number of examples per task, defaults to None.
    device (Optional[str]): Device to run on. If None, defaults to "cuda" if available, otherwise "cpu".
    revision (str): Model revision to use, defaults to "main".
    trust_remote_code (bool): Whether to trust remote code, defaults to False.
    log_samples (bool): If True, logs all model outputs and documents, defaults to True.
    verbosity (str): Logging verbosity level, defaults to "INFO".
    **kwargs: Additional keyword arguments passed to `lm_eval.evaluate`.

Returns:
    dict: A dictionary containing the evaluation results.
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class DmxMetric(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation="",
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "references": datasets.Value("string"),
                }
            ),
            reference_urls=["https://github.com/EleutherAI/lm-evaluation-harness"],
        )

    def _compute(
        self,
        model: str,
        tasks: Union[str, List[str]],
        dmx_config: Optional[str] = None,
        num_fewshot: Optional[int] = None,
        batch_size: Optional[Union[int, str]] = None,
        max_batch_size: Optional[int] = None,
        limit: Optional[Union[int, float]] = None,
        device: Optional[str] = None,
        revision: str = "main",
        trust_remote_code: bool = False,
        log_samples: bool = True,
        verbosity: str = "INFO",
        references: Optional[List[str]] = None,  # input feature from `evaluate`; consumed here so it is not forwarded to lm_eval.evaluate
        **kwargs,
    ):
        """
        Evaluate a model on multiple tasks and metrics using lm-eval with optional d-Matrix integration.
        """
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"

        # Instantiate lm-eval's Hugging Face model wrapper from an argument string.
        model_args = (
            f"pretrained={model},revision={revision},"
            f"trust_remote_code={str(trust_remote_code)},device={device}"
        )
        lm = lm_eval.api.registry.get_model("hf").create_from_arg_string(
            model_args,
            {
                "batch_size": batch_size,
                "max_batch_size": max_batch_size,
            },
        )

        # Optionally wrap the underlying torch model with d-Matrix and apply the
        # rule set named by `dmx_config` (looked up as an attribute of `config_rules`).
        if dmx_config:
            lm._model = DmxModel.from_torch(lm._model)
            lm._model.transform(
                lm._model.dmx_config, *getattr(config_rules, dmx_config)
            )

        # Resolve task names into lm-eval task objects and apply the few-shot setting.
        task_dict = lm_eval.tasks.get_task_dict(
            tasks if isinstance(tasks, list) else [tasks]
        )
        for task in task_dict.values():
            if num_fewshot is not None:
                task.set_config(key="num_fewshot", value=num_fewshot)

        eval_params = {
            "lm": lm,
            "task_dict": task_dict,
            "limit": limit,
            "log_samples": log_samples,
            "verbosity": verbosity,
            **kwargs,
        }
        results = lm_eval.evaluate(**eval_params)
        return results.get("results", {})
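

# A minimal usage sketch, not part of the metric itself. Assumptions: the script is
# loaded from a local path, "distilgpt2" and the lm-eval task "hellaswag" are
# available in your environment, and "BASELINE" names a rule set defined on
# `config_rules`; substitute the model, task, and d-Matrix configuration you use.
if __name__ == "__main__":
    metric = evaluate.load("dmxMetric.py")  # or the Hub id of the hosted metric
    results = metric.compute(
        references=[""],  # placeholder required by the feature schema; evaluation is driven by `model`/`tasks`
        model="distilgpt2",
        tasks="hellaswag",
        dmx_config="BASELINE",  # applied as config_rules.BASELINE (assumed to exist)
        limit=10,  # evaluate only a few examples for a quick check
    )
    print(results)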