b_norm / b_norm.py
saridormi's picture
try to fix naming errors
b529fa6
raw
history blame
2.63 kB
from typing import Dict, List
import datasets
import evaluate
from .reused import bleuFromMaps, splitPuncts
# BibTeX entries passed as the metric's citation (see BNorm._info): the
# ICSME 2021 commit message generation study, the original BLEU paper, and
# the Lin & Och (2004) ORANGE paper.
# NOTE(review): the BLEU entry below has an empty `booktitle` field; confirm
# whether that is intentional.
_CITATION = """\
@inproceedings{tao2021evaluation,
title={On the Evaluation of Commit Message Generation Models: An Experimental Study},
author={Tao, Wei and Wang, Yanlin and Shi, Ensheng and Du, Lun and Han, Shi and Zhang, Hongyu and Zhang, Dongmei and Zhang, Wenqiang},
booktitle={2021 IEEE International Conference on Software Maintenance and Evolution (ICSME)},
pages={126--136},
year={2021},
organization={IEEE}
}
@inproceedings{Papineni02bleu:a,
author = {Kishore Papineni and Salim Roukos and Todd Ward and Wei-jing Zhu},
title = {BLEU: a Method for Automatic Evaluation of Machine Translation},
booktitle = {},
year = {2002},
pages = {311--318}
}
@inproceedings{lin-och-2004-orange,
title = "{ORANGE}: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation",
author = "Lin, Chin-Yew and
Och, Franz Josef",
booktitle = "{COLING} 2004: Proceedings of the 20th International Conference on Computational Linguistics",
month = "aug 23{--}aug 27",
year = "2004",
address = "Geneva, Switzerland",
publisher = "COLING",
url = "https://www.aclweb.org/anthology/C04-1072",
pages = "501--507",
}
"""
# Human-readable summary of the metric, passed as `description` to
# evaluate.MetricInfo in BNorm._info.
_DESCRIPTION = """\
B-Norm is a variation of BLEU. It uses smoothing by Lin and Och, 2004 and does some additional preprocessing steps.
It was recommended for evaluation of commit message generation approaches in the
"On the Evaluation of Commit Message Generation Models: An Experimental Study" paper accepted to ICSME 2021.
This class uses implementation provided in the replication package.
"""
class BNorm(evaluate.Metric):
    """`evaluate` metric wrapper exposing the B-Norm score.

    Inputs are plain strings. Each one is stripped, lowercased and run
    through ``splitPuncts`` before the pair of maps is handed to
    ``bleuFromMaps``.
    """

    def _info(self):
        """Return the metric metadata: description, citation, input schema and code URL."""
        # Both inputs are declared as single string values.
        input_schema = datasets.Features(
            {
                "predictions": datasets.Value("string", id="sequence"),
                "references": datasets.Value("string", id="sequence"),
            }
        )
        upstream_sources = [
            "https://github.com/DeepSoftwareAnalytics/CommitMsgEmpirical/blob/main/metrics/B-Norm.py"
        ]
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            features=input_schema,
            codebase_urls=upstream_sources,
        )

    def _compute(self, predictions: List[str], references: List[str]) -> Dict[str, float]:  # type: ignore[override]
        """Compute B-Norm for paired predictions and references.

        Args:
            predictions: Generated texts, one per example.
            references: Reference texts, one per example, in the same order.

        Returns:
            A dict with the single key ``"b_norm"`` holding the first element
            of the ``bleuFromMaps`` result divided by 100.0.
        """

        def _as_entry(text: str) -> list:
            # Normalization applied to every input: trim, lowercase, then
            # splitPuncts; the result is wrapped in a one-element list.
            return [splitPuncts(text.strip().lower())]

        # Key each side by its position so the two maps line up by index.
        hypotheses_by_index = {}
        for position, hypothesis in enumerate(predictions):
            hypotheses_by_index[position] = _as_entry(hypothesis)

        golds_by_index = {}
        for position, gold in enumerate(references):
            golds_by_index[position] = _as_entry(gold)

        scores = bleuFromMaps(golds_by_index, hypotheses_by_index)
        # presumably scores[0] is on a 0-100 scale, hence the division; verify in reused.py
        return {"b_norm": scores[0] / 100.0}