add metric
apps-metric.py CHANGED (+29 -43)
@@ -15,81 +15,67 @@
 import evaluate
 import datasets
+from utils import compute_metrics
 
 
-# TODO: Add BibTeX citation
 _CITATION = """\
-@InProceedings{huggingface:module,
-title = {A great new module},
-authors={huggingface, Inc.},
-year={2020}
+@article{hendrycksapps2021,
+  title={Measuring Coding Challenge Competence With APPS},
+  author={Dan Hendrycks and Steven Basart and Saurav Kadavath and Mantas Mazeika and Akul Arora and Ethan Guo and Collin Burns and Samir Puranik and Horace He and Dawn Song and Jacob Steinhardt},
+  journal={NeurIPS},
+  year={2021}
 }
 """
 
-# TODO: Add description of the module here
+
 _DESCRIPTION = """\
-This new module is designed to solve this great ML task and is crafted with a lot of care.
+This is a metric to evaluate code generation using the APPS benchmark "Measuring Coding Challenge Competence With
+APPS" (https://arxiv.org/pdf/2105.09938.pdf).
 """
 
 
 # TODO: Add description of the arguments of the module here
 _KWARGS_DESCRIPTION = """
-Calculates how good are predictions given some references, using certain scores
+Computes average accuracy and strict accuracy for single generations, and pass@k for multiple generations.
 Args:
-    predictions: list of predictions to score. Each predictions
-        should be a string with tokens separated by spaces.
-    references: list of reference for each prediction. Each
-        reference should be a string with tokens separated by spaces.
+    predictions: list of code generations to score. It's a list of list(s), each corresponding to a problem from the APPS dataset.
+
 Returns:
-    accuracy: description of the first score,
-    another_score: description of the second score,
+    metrics: dict of three metrics: average accuracy, strict accuracy, and pass@k.
 Examples:
-    Examples should be written in doctest format, and should illustrate how
-    to use the function.
-
-    >>> my_new_module = evaluate.load("my_new_module")
-    >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
+    >>> my_new_module = evaluate.load("loubnabnl/apps-metric")
+    >>> results = my_new_module.compute(predictions=[["s=inpu()\nprint(s)"]])
     >>> print(results)
-    {'accuracy': 1.0}
+    {'avg_accuracy': 0, 'strict_accuracy': 0, 'pass_at_k': None}
 """
 
-# TODO: Define external resources urls if needed
-BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
 
 
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class apps_metric(evaluate.EvaluationModule):
-    """TODO: Short description of my evaluation module."""
+    """Evaluate code generation on the APPS benchmark.
+    The generations are compiled and their corresponding unit tests are run."""
 
     def _info(self):
-        # TODO: Specifies the evaluate.EvaluationModuleInfo object
         return evaluate.EvaluationModuleInfo(
-            # This is the description that will appear on the modules page.
             module_type="metric",
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
-            # This defines the format of each prediction and reference
             features=datasets.Features({
-                'predictions': datasets.Value('int64'),
-                'references': datasets.Value('int64'),
+                'predictions': datasets.Sequence(datasets.Value("string")),
             }),
-            # Homepage of the module for documentation
-            homepage="http://module.homepage",
-            # Additional links to the codebase or references
-            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
-            reference_urls=["http://path.to.reference.url/new_module"]
+            homepage="https://github.com/hendrycks/apps",
+            reference_urls=["https://huggingface.co/datasets/codeparrot/apps"]
         )
 
-    def _download_and_prepare(self, dl_manager):
-        """Optional: download external resources useful to compute the scores"""
-        # TODO: Download external resources if needed
-        pass
-
-    def _compute(self, predictions, references):
+    def _compute(self, predictions, k_list=[1, 10, 100], count_errors=True, level=["all"]):
         """Returns the scores"""
-        # TODO: Compute the different scores of the module
-        accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
-        return {
-            "accuracy": accuracy,
-        }
+        metrics = compute_metrics(predictions, k_list=k_list, count_errors=count_errors, level=level)
+        return metrics
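The new _compute delegates all scoring to compute_metrics from utils, which this diff does not include. As orientation only, here is a hedged sketch of how the three reported scores are conventionally obtained: average accuracy (mean fraction of unit tests passed) and strict accuracy (fraction of problems whose candidate passes every test), as defined in the APPS paper, and pass@k via the unbiased estimator of Chen et al. (2021). The names compute_metrics_sketch, pass_at_k, and test_fractions are hypothetical; the real utils.py may differ.

# Hypothetical sketch only; the actual utils.compute_metrics is not part of
# this diff. test_fractions[i][j] is assumed to be the fraction of unit tests
# passed by candidate j on problem i (1.0 means every test passed).
import numpy as np

def pass_at_k(n, c, k):
    # Unbiased pass@k estimator from Chen et al. (2021):
    # 1 - C(n - c, k) / C(n, k), evaluated as a numerically stable product.
    if n - c < k:
        return 1.0
    return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

def compute_metrics_sketch(test_fractions, k_list=(1, 10, 100)):
    n = len(test_fractions[0])  # candidates per problem, assumed constant
    if n == 1:  # single generation: the two accuracy scores from the APPS paper
        fracs = [cands[0] for cands in test_fractions]
        return {"avg_accuracy": float(np.mean(fracs)),
                "strict_accuracy": float(np.mean([f == 1.0 for f in fracs])),
                "pass_at_k": None}
    # multiple generations: pass@k over problems, counting a candidate as
    # correct only if it passes every unit test
    pass_at = {f"pass@{k}": float(np.mean([
                   pass_at_k(n, sum(f == 1.0 for f in cands), k)
                   for cands in test_fractions]))
               for k in k_list if k <= n}
    return {"avg_accuracy": None, "strict_accuracy": None, "pass_at_k": pass_at}

print(compute_metrics_sketch([[0.0]]))
# {'avg_accuracy': 0.0, 'strict_accuracy': 0.0, 'pass_at_k': None}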
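Finally, a minimal usage sketch, not part of the commit: it assumes the module is published under the id loubnabnl/apps-metric given in the docstring and that utils.py ships next to this file. Scoring APPS generations executes model-written code against unit tests, so run it in a sandboxed environment.

# Minimal usage sketch (assumption: module id taken from the docstring above).
# Evaluation executes the generated programs, so sandbox this.
import evaluate

apps_metric = evaluate.load("loubnabnl/apps-metric")

# One inner list of candidate solutions per APPS problem; a single candidate
# per problem yields avg/strict accuracy, several candidates enable pass@k.
generations = [["s = input()\nprint(s)"]]

results = apps_metric.compute(predictions=generations)
print(results)  # e.g. {'avg_accuracy': 0, 'strict_accuracy': 0, 'pass_at_k': None}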