from typing import Dict, List

import datasets
import evaluate

from .reused import bleuFromMaps, splitPuncts

_CITATION = """\
@inproceedings{tao2021evaluation,
  title={On the Evaluation of Commit Message Generation Models: An Experimental Study},
  author={Tao, Wei and Wang, Yanlin and Shi, Ensheng and Du, Lun and Han, Shi and Zhang, Hongyu and Zhang, Dongmei and Zhang, Wenqiang},
  booktitle={2021 IEEE International Conference on Software Maintenance and Evolution (ICSME)},
  pages={126--136},
  year={2021},
  organization={IEEE}
}
@inproceedings{Papineni02bleu:a,
  author = {Kishore Papineni and Salim Roukos and Todd Ward and Wei-jing Zhu},
  title = {BLEU: a Method for Automatic Evaluation of Machine Translation},
  booktitle = {Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics (ACL)},
  year = {2002},
  pages = {311--318}
}
@inproceedings{lin-och-2004-orange,
  title = "{ORANGE}: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation",
  author = "Lin, Chin-Yew and Och, Franz Josef",
  booktitle = "{COLING} 2004: Proceedings of the 20th International Conference on Computational Linguistics",
  month = "aug 23{--}aug 27",
  year = "2004",
  address = "Geneva, Switzerland",
  publisher = "COLING",
  url = "https://www.aclweb.org/anthology/C04-1072",
  pages = "501--507",
}
"""

_DESCRIPTION = """\
B-Norm is a variation of BLEU. It uses the smoothing method of Lin and Och (2004) and performs
additional preprocessing steps (lowercasing and punctuation splitting). It was recommended for
the evaluation of commit message generation approaches in the paper
"On the Evaluation of Commit Message Generation Models: An Experimental Study" (ICSME 2021).
This class uses the implementation provided in the replication package.
"""


class BNorm(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Value("string", id="sequence"),
                }
            ),
            codebase_urls=["https://github.com/DeepSoftwareAnalytics/CommitMsgEmpirical/blob/main/metrics/B-Norm.py"],
        )

    def _compute(self, predictions: List[str], references: List[str]) -> Dict[str, float]:  # type: ignore[override]
        # Lowercase and split punctuation in both hypotheses and references,
        # following the preprocessing of the original B-Norm script.
        prediction_map = {i: [splitPuncts(pred.strip().lower())] for i, pred in enumerate(predictions)}
        gold_map = {i: [splitPuncts(ref.strip().lower())] for i, ref in enumerate(references)}
        # bleuFromMaps returns a score on a 0-100 scale; rescale to [0, 1].
        return {"b_norm": bleuFromMaps(gold_map, prediction_map)[0] / 100.0}
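

if __name__ == "__main__":
    # Minimal usage sketch, for illustration only: compute B-Norm on a toy
    # prediction/reference pair. Assumes this file is executed as a module
    # within its package (e.g. `python -m <package>.<module>`, where the
    # package name is a placeholder) so that the relative import of `reused`
    # resolves; it is not part of the metric's required interface.
    metric = BNorm()
    toy_result = metric.compute(
        predictions=["fix typo in readme"],
        references=["Fix typo in README file"],
    )
    print(toy_result)  # {'b_norm': <value between 0 and 1>}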