File size: 2,977 Bytes
2a517dd
 
 
 
 
 
 
 
 
 
 
 
 
c7245a9
2a517dd
 
 
2738707
2a517dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation of code generation on the APPS benchmark"""

import evaluate
import datasets
from utils import compute_metrics


# BibTeX entry for the APPS paper, surfaced through the module info.
_CITATION = """\
@article{hendrycksapps2021,
  title={Measuring Coding Challenge Competence With APPS},
  author={Dan Hendrycks and Steven Basart and Saurav Kadavath and Mantas Mazeika and Akul Arora and Ethan Guo and Collin Burns and Samir Puranik and Horace He and Dawn Song and Jacob Steinhardt},
  journal={NeurIPS},
  year={2021}
}
"""


# Short summary of the metric, surfaced through the module info.
_DESCRIPTION = """\
This is a metric to evaluate code generation using the APPS benchmark "Measuring Coding Challenge Competence With
APPS" (https://arxiv.org/pdf/2105.09938.pdf).
"""


# Usage documentation for the inputs accepted by the module.
# The example escapes the newline (``\\n``) so that it is rendered as a single
# line of code instead of being split in two inside the docstring.
_KWARGS_DESCRIPTION = """
Computes Average accuracy and strict accuracy for single generations, and pass@k for multiple generations.
Args:
    predictions: list of code generations to score. It's a list of list(s), each corresponding to a problem from APPS dataset.
    k_list: list of k values used for pass@k. Defaults to [1, 10, 100].
    count_errors: flag forwarded to the scoring routine. Defaults to True.
    level: list forwarded to the scoring routine. Defaults to ["all"].

Returns:
    metrics: dict of three metrics: average accuracy, strict accuracy, and pass@k.
Examples:
    >>> my_new_module = evaluate.load("loubnabnl/apps_metric")
    >>> results = my_new_module.compute(predictions=[["s=inpu()\\nprint(s)"]])
    >>> print(results)
    {'avg_accuracy': 0, 'strict_accuracy': 0, 'pass_at_k': None}
"""




@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class apps_metric(evaluate.EvaluationModule):
    """Evaluate code generation on the APPS benchmark.

    The generations are compiled and their corresponding unit tests are run.
    """

    def _info(self):
        """Return the module description, citation, input schema and links."""
        return evaluate.EvaluationModuleInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # Each example is a sequence of strings: the candidate programs
            # generated for one problem.
            features=datasets.Features({
                'predictions': datasets.Sequence(datasets.Value("string")),
            }),
            homepage="https://github.com/hendrycks/apps",
            reference_urls=["https://huggingface.co/datasets/codeparrot/apps"]
        )

    def _compute(self, predictions=None, k_list=None, count_errors=True, level=None, generations=None):
        """Score the generations and return the metrics.

        Args:
            predictions: code generations to score, one list of candidate
                programs per problem. The name matches the ``predictions``
                feature declared in ``_info``: ``evaluate`` hands the inputs
                to ``_compute`` as keyword arguments named after the features,
                so any other name makes ``compute(predictions=...)`` fail.
            k_list: values of ``k`` forwarded to ``compute_metrics``.
                ``None`` (the default) means ``[1, 10, 100]``.
            count_errors: forwarded unchanged to ``compute_metrics``.
            level: forwarded to ``compute_metrics``. ``None`` (the default)
                means ``["all"]``.
            generations: former name of ``predictions``; still accepted as a
                keyword so existing direct callers keep working. Ignored when
                ``predictions`` is given.

        Returns:
            Whatever ``compute_metrics`` returns for these arguments.

        Raises:
            TypeError: if neither ``predictions`` nor ``generations`` is given.
        """
        if predictions is None:
            predictions = generations
        if predictions is None:
            raise TypeError("_compute() missing required argument: 'predictions'")

        # Fresh lists on every call: list literals used as default values
        # would be shared between calls.
        if k_list is None:
            k_list = [1, 10, 100]
        if level is None:
            level = ["all"]

        return compute_metrics(predictions, k_list=k_list, count_errors=count_errors, level=level)