File size: 3,072 Bytes
a977e3e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cbe9336
a977e3e
cbe9336
a977e3e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation of code generation on the APPS benchmark"""

import evaluate
import datasets
from .utils import compute_metrics
from .testing_util import run_test


# BibTeX entry for the APPS paper; passed as `citation` to the module info.
_CITATION = """\
@article{hendrycksapps2021,
  title={Measuring Coding Challenge Competence With APPS},
  author={Dan Hendrycks and Steven Basart and Saurav Kadavath and Mantas Mazeika and Akul Arora and Ethan Guo and Collin Burns and Samir Puranik and Horace He and Dawn Song and Jacob Steinhardt},
  journal={NeurIPS},
  year={2021}
}
"""


# Short summary of the metric; passed as `description` to the module info and
# prepended to the metric class docstring.
_DESCRIPTION = """\
This is a metric to evaluate code generation using the APPS benchmark "Measuring Coding Challenge Competence With
APPS" (https://arxiv.org/pdf/2105.09938.pdf).
"""


# Usage documentation; passed as `inputs_description` to the module info.
# The "\\n" in the example is escaped on purpose: this is not a raw string, so a
# single backslash would put a real line break inside the example's string literal.
# TODO: document the remaining `_compute` keyword arguments
# (indices, k_list, count_errors, level, debug).
_KWARGS_DESCRIPTION = """
Computes Average accuracy and strict accuracy for single generations, and pass@k for multiple generations.
Args:
    predictions: list of code generations to score. It's a list of list(s), each corresponding to a problem from APPS dataset.

Returns:
    metrics: dict of three metrics: average accuracy, strict accuracy, and pass@k.
Examples:
    >>> my_new_module = evaluate.load("loubnabnl/apps_metric")
    >>> results = my_new_module.compute(predictions=[["s=input()\\nprint(s)"]])
    >>> print(results)
    {'avg_accuracy': 0, 'strict_accuracy': 0, 'pass_at_k': None}
"""




@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class apps_metric(evaluate.EvaluationModule):
    """Evaluate code generation on APPS benchmark.

    The generations are compiled and their corresponding unit tests are run.
    Scoring itself is delegated to ``compute_metrics`` from the sibling
    ``utils`` module; this class only declares the module metadata and
    forwards the arguments.
    """

    def _info(self):
        """Return the metadata describing this evaluation module.

        Returns:
            An ``evaluate.EvaluationModuleInfo`` carrying the module-level
            description, citation and usage text, and declaring a single
            ``predictions`` input: a sequence of strings per example.
        """
        return evaluate.EvaluationModuleInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features({
                'predictions': datasets.Sequence(datasets.Value("string")),
            }),
            homepage="https://github.com/hendrycks/apps",
            reference_urls=["https://huggingface.co/datasets/codeparrot/apps"]
        )

    def _compute(self, predictions, indices=None, k_list=None, count_errors=True, level="all", debug=False):
        """Return the scores computed by ``compute_metrics``.

        Every argument is forwarded unchanged to ``compute_metrics``; the
        notes below on what each one means are assumptions to confirm
        against that helper.

        Args:
            predictions: code generations to score, one list of candidate
                solutions per problem.
            indices: optional; presumably selects which problems are
                evaluated -- TODO confirm.
            k_list: presumably the ``k`` values used for pass@k. ``None``
                (the default) means ``[1, 10, 100]``.
            count_errors: forwarded as-is; semantics defined by
                ``compute_metrics``.
            level: forwarded as-is; defaults to ``"all"``.
            debug: forwarded as-is; defaults to ``False``.

        Returns:
            Whatever ``compute_metrics`` returns (documented at module
            level as a dict of metrics).
        """
        # Build the default per call instead of using a mutable default
        # argument, so one list object is never shared between invocations.
        if k_list is None:
            k_list = [1, 10, 100]
        return compute_metrics(
            predictions,
            indices=indices,
            k_list=k_list,
            count_errors=count_errors,
            level=level,
            debug=debug,
        )