loubnabnl (HF staff) committed c6538f5 (parent: 117da40)

add metric

Files changed (1)
  1. apps-metric.py +29 -43
apps-metric.py CHANGED
@@ -15,81 +15,67 @@
 
 import evaluate
 import datasets
+from utils import compute_metrics
 
 
-# TODO: Add BibTeX citation
 _CITATION = """\
-@InProceedings{huggingface:module,
-title = {A great new module},
-authors={huggingface, Inc.},
-year={2020}
+@article{hendrycksapps2021,
+  title={Measuring Coding Challenge Competence With APPS},
+  author={Dan Hendrycks and Steven Basart and Saurav Kadavath and Mantas Mazeika and Akul Arora and Ethan Guo and Collin Burns and Samir Puranik and Horace He and Dawn Song and Jacob Steinhardt},
+  journal={NeurIPS},
+  year={2021}
 }
 """
 
-# TODO: Add description of the module here
+
 _DESCRIPTION = """\
-This new module is designed to solve this great ML task and is crafted with a lot of care.
+This is a metric to evaluate code generation using the APPS benchmark "Measuring Coding Challenge Competence With
+APPS" (https://arxiv.org/pdf/2105.09938.pdf).
 """
 
 
 # TODO: Add description of the arguments of the module here
 _KWARGS_DESCRIPTION = """
-Calculates how good are predictions given some references, using certain scores
+Computes average accuracy and strict accuracy for single generations, and pass@k for multiple generations.
 Args:
-    predictions: list of predictions to score. Each predictions
-        should be a string with tokens separated by spaces.
-    references: list of reference for each prediction. Each
-        reference should be a string with tokens separated by spaces.
+    predictions: list of code generations to score. It is a list of lists, each corresponding to a problem from the APPS dataset.
+
 Returns:
-    accuracy: description of the first score,
-    another_score: description of the second score,
+    metrics: dict of three metrics: average accuracy, strict accuracy, and pass@k.
 Examples:
-    Examples should be written in doctest format, and should illustrate how
-    to use the function.
-
-    >>> my_new_module = evaluate.load("my_new_module")
-    >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
+    >>> my_new_module = evaluate.load("loubnabnl/apps-metric")
+    >>> results = my_new_module.compute(references=["s=inpu()\nprint(s)"])
     >>> print(results)
-    {'accuracy': 1.0}
+    {'avg_accuracy': 0, 'strict_accuracy': 0, 'pass_at_k': None}
 """
 
-# TODO: Define external resources urls if needed
-BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
+
 
 
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class apps-metric(evaluate.EvaluationModule):
-    """TODO: Short description of my evaluation module."""
+    """Evaluate code generation on the APPS benchmark.
+    The generations are compiled and their corresponding unit tests are run."""
 
     def _info(self):
-        # TODO: Specifies the evaluate.EvaluationModuleInfo object
+
         return evaluate.EvaluationModuleInfo(
-            # This is the description that will appear on the modules page.
+
             module_type="metric",
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
-            # This defines the format of each prediction and reference
+
             features=datasets.Features({
-                'predictions': datasets.Value('int64'),
-                'references': datasets.Value('int64'),
+                'predictions': datasets.Sequence(datasets.Value("string")),
             }),
-            # Homepage of the module for documentation
-            homepage="http://module.homepage",
-            # Additional links to the codebase or references
-            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
-            reference_urls=["http://path.to.reference.url/new_module"]
+            homepage="https://github.com/hendrycks/apps",
+            reference_urls=["https://huggingface.co/datasets/codeparrot/apps"]
         )
 
-    def _download_and_prepare(self, dl_manager):
-        """Optional: download external resources useful to compute the scores"""
-        # TODO: Download external resources if needed
-        pass
 
-    def _compute(self, predictions, references):
+
+    def _compute(self, generations, k_list=[1, 10, 100], count_errors=True, level=["all"]):
         """Returns the scores"""
-        # TODO: Compute the different scores of the module
-        accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
-        return {
-            "accuracy": accuracy,
-        }
+        metrics = compute_metrics(generations, k_list=k_list, count_errors=count_errors, level=level)
+        return metrics
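
For readers who want to try the module added by this commit, below is a minimal usage sketch; it is not part of the commit itself. It assumes the metric is loaded from the loubnabnl/apps-metric space named in the docstring, that inputs are passed under the predictions feature declared in _info (the docstring example and _compute use references and generations respectively, so the keyword accepted by this exact revision may differ), and that utils.compute_metrics is available alongside the module.

import evaluate

# Hypothetical usage sketch (not part of this commit): load the APPS metric
# from the Hub and score candidate programs against the APPS unit tests.
apps_metric = evaluate.load("loubnabnl/apps-metric")

# One inner list of candidate programs per APPS problem; a single generation
# per problem yields average/strict accuracy, several per problem enable pass@k.
generations = [["s = input()\nprint(s)"]]

# k_list, count_errors and level mirror the _compute() signature above;
# the keyword `predictions` follows the feature declared in _info() and is an assumption.
results = apps_metric.compute(predictions=generations, k_list=[1], count_errors=True, level=["all"])
print(results)  # e.g. {'avg_accuracy': ..., 'strict_accuracy': ..., 'pass_at_k': ...}

The pass@k reported for multiple generations presumably follows the unbiased estimator popularized by the HumanEval/Codex evaluation: an estimate of the probability that at least one of k sampled programs per problem passes all of that problem's unit tests.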