|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Python implementation of BLEU and smooth-BLEU. |
|
|
|
This module provides a Python implementation of BLEU and smooth-BLEU. |
|
Smooth BLEU is computed following the method outlined in the paper: |
|
Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic |
|
evaluation metrics for machine translation. COLING 2004. |
|
""" |
|
|
|
import collections |
|
import math |
|
|
|
|
|
def _get_ngrams(segment, max_order): |
|
"""Extracts all n-grams upto a given maximum order from an input segment. |
|
|
|
Args: |
|
segment: text segment from which n-grams will be extracted. |
|
max_order: maximum length in tokens of the n-grams returned by this |
|
methods. |
|
|
|
Returns: |
|
The Counter containing all n-grams upto max_order in segment |
|
with a count of how many times each n-gram occurred. |
|
""" |
|
ngram_counts = collections.Counter() |
|
for order in range(1, max_order + 1): |
|
for i in range(0, len(segment) - order + 1): |
|
ngram = tuple(segment[i:i+order]) |
|
ngram_counts[ngram] += 1 |
|
return ngram_counts |
|
|
|
|
|
def compute_bleu(reference_corpus, translation_corpus, max_order=4,
                 smooth=False):
    """Computes BLEU score of translated segments against one or more references.

    Args:
      reference_corpus: list of lists of references for each translation. Each
        reference should be tokenized into a list of tokens.
      translation_corpus: list of translations to score. Each translation
        should be tokenized into a list of tokens.
      max_order: Maximum n-gram order to use when computing BLEU score.
      smooth: Whether or not to apply Lin et al. 2004 smoothing.

    Returns:
      6-tuple (bleu, precisions, bp, ratio, translation_length,
      reference_length) where:
        bleu: geometric mean of the n-gram precisions times the brevity
          penalty.
        precisions: list with one n-gram precision per order 1..max_order.
        bp: brevity penalty.
        ratio: translation_length / reference_length. Reported as 0.0 when
          the translations contain no tokens, and as infinity when only the
          references are empty.
        translation_length: total number of tokens over all translations.
        reference_length: sum, over segments, of the length of the shortest
          reference of that segment.
    """
    matches_by_order = [0] * max_order
    possible_matches_by_order = [0] * max_order
    reference_length = 0
    translation_length = 0
    for references, translation in zip(reference_corpus, translation_corpus):
        reference_length += min(len(r) for r in references)
        translation_length += len(translation)

        # Counter union keeps, per n-gram, the highest count seen in any single
        # reference; intersecting with the translation counts then clips each
        # translation n-gram to that maximum.
        merged_ref_ngram_counts = collections.Counter()
        for reference in references:
            merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
        translation_ngram_counts = _get_ngrams(translation, max_order)
        overlap = translation_ngram_counts & merged_ref_ngram_counts
        for ngram, count in overlap.items():
            matches_by_order[len(ngram) - 1] += count
        for order in range(1, max_order + 1):
            possible_matches = len(translation) - order + 1
            if possible_matches > 0:
                possible_matches_by_order[order - 1] += possible_matches

    precisions = [0] * max_order
    for i in range(max_order):
        if smooth:
            # Add-one smoothing of numerator and denominator.
            precisions[i] = ((matches_by_order[i] + 1.) /
                             (possible_matches_by_order[i] + 1.))
        elif possible_matches_by_order[i] > 0:
            precisions[i] = (float(matches_by_order[i]) /
                             possible_matches_by_order[i])
        else:
            precisions[i] = 0.0

    if min(precisions) > 0:
        p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
        geo_mean = math.exp(p_log_sum)
    else:
        geo_mean = 0

    # Guard the two divisions below: an empty translation side or an empty
    # reference side would otherwise raise ZeroDivisionError.
    if translation_length == 0:
        ratio = 0.0
    elif reference_length == 0:
        ratio = float("inf")
    else:
        ratio = float(translation_length) / reference_length

    if ratio > 1.0:
        bp = 1.
    elif ratio > 0:
        bp = math.exp(1 - 1. / ratio)
    else:
        # Limit of exp(1 - 1 / ratio) as ratio approaches zero from above.
        bp = 0.0

    bleu = geo_mean * bp

    return (bleu, precisions, bp, ratio, translation_length, reference_length)
|
|
|
|
|
def _bleu(ref_file, trans_file, subword_option=None):
    """Scores a translation file against a reference file with smoothed BLEU-4.

    Both files are read as UTF-8 text, one segment per line, and every line is
    tokenized by stripping it and splitting on whitespace. Line i of trans_file
    is scored against line i of ref_file.

    Args:
      ref_file: path of the file holding one reference per line.
      trans_file: path of the file holding one translation per line.
      subword_option: accepted for call compatibility but not used here; no
        subword post-processing is applied to the text.

    Returns:
      The corpus-level smoothed BLEU score (max n-gram order 4) scaled to
      0-100 and rounded to two decimal places.
    """
    max_order = 4
    smooth = True
    ref_files = [ref_file]

    # Explicit encoding: the platform default is locale-dependent, which would
    # make tokenization (and therefore the score) vary between machines.
    reference_text = []
    for reference_filename in ref_files:
        with open(reference_filename, encoding="utf-8") as fh:
            reference_text.append(fh.readlines())

    # Transpose "one list of lines per reference file" into "one list of
    # tokenized references per segment".
    per_segment_references = [
        [reference.strip().split() for reference in references]
        for references in zip(*reference_text)
    ]

    with open(trans_file, encoding="utf-8") as fh:
        translations = [line.strip().split() for line in fh]

    bleu_score = compute_bleu(
        per_segment_references, translations, max_order, smooth)[0]
    return round(100 * bleu_score, 2)