saridormi committed on
Commit
6de9c64
1 parent: bac73fa

initial commit

Files changed (4)
  1. app.py +6 -0
  2. b_norm.py +63 -0
  3. requirements.txt +1 -0
  4. reused.py +221 -0
app.py ADDED
@@ -0,0 +1,6 @@
+ import evaluate
+ from evaluate.utils import launch_gradio_widget
+
+
+ module = evaluate.load("saridormi/b_norm")
+ launch_gradio_widget(module)
b_norm.py ADDED
@@ -0,0 +1,63 @@
+ from typing import Dict, List
+
+ import datasets
+ import evaluate
+
+ from .reused import bleuFromMaps, splitPuncts
+
+ _CITATION = """\
+ @inproceedings{tao2021evaluation,
+     title={On the Evaluation of Commit Message Generation Models: An Experimental Study},
+     author={Tao, Wei and Wang, Yanlin and Shi, Ensheng and Du, Lun and Han, Shi and Zhang, Hongyu and Zhang, Dongmei and Zhang, Wenqiang},
+     booktitle={2021 IEEE International Conference on Software Maintenance and Evolution (ICSME)},
+     pages={126--136},
+     year={2021},
+     organization={IEEE}
+ }
+ @inproceedings{Papineni02bleu:a,
+     author = {Kishore Papineni and Salim Roukos and Todd Ward and Wei-jing Zhu},
+     title = {BLEU: a Method for Automatic Evaluation of Machine Translation},
+     booktitle = {},
+     year = {2002},
+     pages = {311--318}
+ }
+ @inproceedings{lin-och-2004-orange,
+     title = "{ORANGE}: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation",
+     author = "Lin, Chin-Yew and
+       Och, Franz Josef",
+     booktitle = "{COLING} 2004: Proceedings of the 20th International Conference on Computational Linguistics",
+     month = "aug 23{--}aug 27",
+     year = "2004",
+     address = "Geneva, Switzerland",
+     publisher = "COLING",
+     url = "https://www.aclweb.org/anthology/C04-1072",
+     pages = "501--507",
+ }
+ """
+
+ _DESCRIPTION = """\
+ B-Norm is a variation of BLEU. It uses the smoothing by Lin and Och, 2004, and performs some additional preprocessing steps.
+ It was recommended for evaluating commit message generation approaches in the paper
+ "On the Evaluation of Commit Message Generation Models: An Experimental Study", accepted to ICSME 2021.
+ This class uses the implementation provided in the replication package.
+ """
+
+
+ class BLEUNorm(evaluate.Metric):
+     def _info(self):
+         return evaluate.MetricInfo(
+             description=_DESCRIPTION,
+             citation=_CITATION,
+             features=datasets.Features(
+                 {
+                     "predictions": datasets.Value("string", id="sequence"),
+                     "references": datasets.Value("string", id="sequence"),
+                 }
+             ),
+             codebase_urls=["https://github.com/DeepSoftwareAnalytics/CommitMsgEmpirical/blob/main/metrics/B-Norm.py"],
+         )
+
+     def _compute(self, predictions: List[str], references: List[str]) -> Dict[str, float]:  # type: ignore[override]
+         prediction_map = {i: [splitPuncts(pred.strip().lower())] for i, pred in enumerate(predictions)}
+         gold_map = {i: [splitPuncts(ref.strip().lower())] for i, ref in enumerate(references)}
+         return {"b_norm": bleuFromMaps(gold_map, prediction_map)[0] / 100.0}
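
(For context: once published on the Hub, this metric can also be called directly from Python rather than through the Gradio widget. A minimal sketch using the standard evaluate API; the commit messages are made up for illustration:)

import evaluate

# Load the metric by the same id that app.py uses.
b_norm = evaluate.load("saridormi/b_norm")

# Toy inputs, purely illustrative.
predictions = ["fix typo in readme"]
references = ["fix a typo in the readme"]

# _compute lowercases, strips, and splits punctuation internally, then
# returns the smoothed BLEU rescaled from percent to [0, 1].
print(b_norm.compute(predictions=predictions, references=references))  # {'b_norm': ...}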
requirements.txt ADDED
@@ -0,0 +1 @@
+ evaluate
reused.py ADDED
@@ -0,0 +1,221 @@
+ """
+ This script is copied from https://github.com/DeepSoftwareAnalytics/CommitMsgEmpirical,
+ the replication package for "On the Evaluation of Commit Message Generation Models: An Experimental Study"
+ accepted to ICSME 2021.
+ """
+
+ #!/usr/bin/python
+
+ """
+ This script was adapted from the original version by hieuhoang1972 which is part of MOSES.
+ """
+
+ # $Id: bleu.py 1307 2007-03-14 22:22:36Z hieuhoang1972 $
+
+ """Provides:
+ cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
+ cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked().
+ score_cooked(alltest, n=4): Score a list of cooked test sentences.
+ score_set(s, testid, refids, n=4): Interface with dataset.py; calculate BLEU score of testid against refids.
+ The reason for breaking the BLEU computation into three phases cook_refs(), cook_test(), and score_cooked() is to allow the caller to calculate BLEU scores for multiple test sets as efficiently as possible.
+ """
+
+ import math
+ import os
+ import re
+ import subprocess
+ import sys
+ import xml.sax.saxutils
+
+ # Added to bypass NIST-style pre-processing of hyp and ref files -- wade
+ nonorm = 0
+
+ preserve_case = False
+ eff_ref_len = "shortest"
+
+ normalize1 = [
+     ("<skipped>", ""),  # strip "skipped" tags
+     (r"-\n", ""),  # strip end-of-line hyphenation and join lines
+     (r"\n", " "),  # join lines
+     # (r'(\d)\s+(?=\d)', r'\1'),  # join digits
+ ]
+ normalize1 = [(re.compile(pattern), replace) for (pattern, replace) in normalize1]
+
+ normalize2 = [
+     (r"([\{-\~\[-\` -\&\(-\+\:-\@\/])", r" \1 "),  # tokenize punctuation. apostrophe is missing
+     (r"([^0-9])([\.,])", r"\1 \2 "),  # tokenize period and comma unless preceded by a digit
+     (r"([\.,])([^0-9])", r" \1 \2"),  # tokenize period and comma unless followed by a digit
+     (r"([0-9])(-)", r"\1 \2 "),  # tokenize dash when preceded by a digit
+ ]
+ normalize2 = [(re.compile(pattern), replace) for (pattern, replace) in normalize2]
+
+
+ def normalize(s):
+     """Normalize and tokenize text. This is lifted from NIST mteval-v11a.pl."""
+     # Added to bypass NIST-style pre-processing of hyp and ref files -- wade
+     if nonorm:
+         return s.split()
+     if type(s) is not str:
+         s = " ".join(s)
+     # language-independent part:
+     for (pattern, replace) in normalize1:
+         s = re.sub(pattern, replace, s)
+     s = xml.sax.saxutils.unescape(s, {"&quot;": '"'})
+     # language-dependent part (assuming Western languages):
+     s = " %s " % s
+     if not preserve_case:
+         s = s.lower()  # this might not be identical to the original
+     for (pattern, replace) in normalize2:
+         s = re.sub(pattern, replace, s)
+     return s.split()
+
+
+ def count_ngrams(words, n=4):
+     counts = {}
+     for k in range(1, n + 1):
+         for i in range(len(words) - k + 1):
+             ngram = tuple(words[i : i + k])
+             counts[ngram] = counts.get(ngram, 0) + 1
+     return counts
+
+
+ def cook_refs(refs, n=4):
+     """Takes a list of reference sentences for a single segment
+     and returns an object that encapsulates everything that BLEU
+     needs to know about them."""
+
+     refs = [normalize(ref) for ref in refs]
+     maxcounts = {}
+     for ref in refs:
+         counts = count_ngrams(ref, n)
+         for (ngram, count) in counts.items():
+             maxcounts[ngram] = max(maxcounts.get(ngram, 0), count)
+     return ([len(ref) for ref in refs], maxcounts)
+
+
+ def cook_test(test, item, n=4):
+     """Takes a test sentence and returns an object that
+     encapsulates everything that BLEU needs to know about it."""
+     (reflens, refmaxcounts) = item
+     test = normalize(test)
+     result = {}
+     result["testlen"] = len(test)
+
+     # Calculate effective reference sentence length.
+
+     if eff_ref_len == "shortest":
+         result["reflen"] = min(reflens)
+     elif eff_ref_len == "average":
+         result["reflen"] = float(sum(reflens)) / len(reflens)
+     elif eff_ref_len == "closest":
+         min_diff = None
+         for reflen in reflens:
+             if min_diff is None or abs(reflen - len(test)) < min_diff:
+                 min_diff = abs(reflen - len(test))
+                 result["reflen"] = reflen
+
+     result["guess"] = [max(len(test) - k + 1, 0) for k in range(1, n + 1)]
+
+     result["correct"] = [0] * n
+     counts = count_ngrams(test, n)
+     for (ngram, count) in counts.items():
+         result["correct"][len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), count)
+
+     return result
+
+
+ def score_cooked(allcomps, n=4, ground=0, smooth=1):
+     totalcomps = {"testlen": 0, "reflen": 0, "guess": [0] * n, "correct": [0] * n}
+     for comps in allcomps:
+         for key in ["testlen", "reflen"]:
+             totalcomps[key] += comps[key]
+         for key in ["guess", "correct"]:
+             for k in range(n):
+                 totalcomps[key][k] += comps[key][k]
+     logbleu = 0.0
+     all_bleus = []
+     for k in range(n):
+         correct = totalcomps["correct"][k]
+         guess = totalcomps["guess"][k]
+         addsmooth = 0
+         if smooth == 1 and k > 0:
+             addsmooth = 1
+         logbleu += math.log(correct + addsmooth + sys.float_info.min) - math.log(guess + addsmooth + sys.float_info.min)
+         if guess == 0:
+             all_bleus.append(-10000000)
+         else:
+             all_bleus.append(math.log(correct + sys.float_info.min) - math.log(guess))
+
+     logbleu /= float(n)
+     all_bleus.insert(0, logbleu)
+
+     brevPenalty = min(0, 1 - float(totalcomps["reflen"] + 1) / (totalcomps["testlen"] + 1))
+     for i in range(len(all_bleus)):
+         if i == 0:
+             all_bleus[i] += brevPenalty
+         all_bleus[i] = math.exp(all_bleus[i])
+     return all_bleus
+
+
+ def bleu(refs, candidate, ground=0, smooth=1):
+     refs = cook_refs(refs)
+     test = cook_test(candidate, refs)
+     return score_cooked([test], ground=ground, smooth=smooth)
+
+
+ def splitPuncts(line):
+     return " ".join(re.findall(r"[\w]+|[^\s\w]", line))
+
+
+ def computeMaps(predictions, goldfile):
+     predictionMap = {}
+     goldMap = {}
+     gf = open(goldfile, "r")
+
+     for row in predictions:
+         cols = row.strip().split("\t")
+         if len(cols) == 1:
+             (rid, pred) = (cols[0], "")
+         else:
+             (rid, pred) = (cols[0], cols[1])
+         predictionMap[rid] = [splitPuncts(pred.strip().lower())]
+
+     for row in gf:
+         (rid, pred) = row.split("\t")
+         if rid in predictionMap:  # Only insert if the id exists for the method
+             if rid not in goldMap:
+                 goldMap[rid] = []
+             goldMap[rid].append(splitPuncts(pred.strip().lower()))
+
+     return (goldMap, predictionMap)
+
+
+ # m1 is the reference map
+ # m2 is the prediction map
+ def bleuFromMaps(m1, m2):
+     score = [0] * 5
+     num = 0.0
+
+     for key in m1:
+         if key in m2:
+             bl = bleu(m1[key], m2[key][0])
+             score = [score[i] + bl[i] for i in range(0, len(bl))]
+             num += 1
+     return [s * 100.0 / num for s in score]
+
+
+ if __name__ == "__main__":
+     ref_sentence_lst = open(sys.argv[1]).read().split("\n")
+     with open("tmp_ref.txt", "w") as f:
+         for idx, ref_sentence in enumerate(ref_sentence_lst):
+             f.write("{}\t{}\n".format(idx, ref_sentence))
+
+     reference_file = "tmp_ref.txt"
+     predictions = []
+     for idx, row in enumerate(sys.stdin):
+         predictions.append("{}\t{}".format(idx, row))
+     if len(predictions) == len(ref_sentence_lst) - 1:
+         predictions.append("{}\t{}".format(len(predictions), ""))
+     (goldMap, predictionMap) = computeMaps(predictions, reference_file)
+     print(bleuFromMaps(goldMap, predictionMap)[0])
+     os.remove("tmp_ref.txt")
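
(For context: a minimal sketch of how b_norm.py drives this module, assuming reused.py is importable from the working directory; the commit messages are made up for illustration:)

from reused import bleuFromMaps, splitPuncts

# Toy inputs, purely illustrative.
predictions = ["update dependencies to latest versions"]
references = ["update dependencies"]

# Build per-id maps exactly the way BLEUNorm._compute does:
# one entry per example id, each value a single-element list.
prediction_map = {i: [splitPuncts(p.strip().lower())] for i, p in enumerate(predictions)}
gold_map = {i: [splitPuncts(r.strip().lower())] for i, r in enumerate(references)}

# bleuFromMaps runs cook_refs -> cook_test -> score_cooked per id and
# returns [BLEU, 1-gram precision, ..., 4-gram precision], each scaled
# by 100; b_norm.py divides the first entry by 100 for a score in [0, 1].
scores = bleuFromMaps(gold_map, prediction_map)
print(scores[0] / 100.0)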