File size: 4,166 Bytes
f959a75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import unittest
from cer import CER

# Single metric instance shared by every test method in this module.
cer = CER()
class TestCER(unittest.TestCase):
    """Unit tests for the character error rate (CER) metric on Hungarian text.

    Conventions used in the per-case comments:

    * ``S`` / ``D`` / ``I`` are the substitutions, deletions and insertions
      needed to turn the prediction into the reference.
    * ``N`` is the number of characters in the reference after whitespace
      normalisation (leading/trailing whitespace stripped, runs of spaces
      collapsed to one).
    * ``CER = (S + D + I) / N``.

    NOTE(review): the expected values assume ``cer.compute`` is
    case-sensitive and normalises whitespace as described above -- confirm
    against the implementation in the ``cer`` module.
    """

    # Absolute tolerance used when comparing floating point error rates.
    TOLERANCE = 1e-6

    def test_cer_case_sensitive(self):
        """Characters that differ only by case count as substitutions."""
        refs = ["Magyar Országgyűlés"]
        # The prediction must differ in case; identical strings would give 0.
        preds = ["magyar országgyűlés"]
        # S = 2, D = 0, I = 0, N = 19, CER = 2 / 19
        char_error_rate = cer.compute(predictions=preds, references=refs)
        self.assertAlmostEqual(char_error_rate, 2 / 19, delta=self.TOLERANCE)

    def test_cer_whitespace(self):
        """A single space is a character; repeated spaces are collapsed."""
        refs = ["Farkasok voltak"]
        preds = ["Farkasokvoltak"]
        # S = 0, D = 0, I = 1, N = 15, CER = 1 / 15
        char_error_rate = cer.compute(predictions=preds, references=refs)
        self.assertAlmostEqual(char_error_rate, 1 / 15, delta=self.TOLERANCE)

        refs = ["Farkasokvoltak"]
        preds = ["Ferkasok     voltak"]
        # The run of spaces collapses to one extra character.
        # S = 1, D = 1, I = 0, N = 14, CER = 2 / 14
        char_error_rate = cer.compute(predictions=preds, references=refs)
        self.assertAlmostEqual(char_error_rate, 2 / 14, delta=self.TOLERANCE)

        # consecutive whitespaces case 1
        refs = ["Farkasok voltak"]
        preds = ["Farkasok               voltak"]
        # S = 0, D = 0, I = 0, N = 15, CER = 0
        char_error_rate = cer.compute(predictions=preds, references=refs)
        self.assertAlmostEqual(char_error_rate, 0.0, delta=self.TOLERANCE)

        # consecutive whitespaces case 2
        refs = ["Farkasok   voltak"]
        preds = ["Farkasok               voltak"]
        # S = 0, D = 0, I = 0, N = 15, CER = 0
        char_error_rate = cer.compute(predictions=preds, references=refs)
        self.assertAlmostEqual(char_error_rate, 0.0, delta=self.TOLERANCE)

    def test_cer_sub(self):
        """One substituted character out of six."""
        refs = ["Magyar"]
        preds = ["Megyar"]
        # S = 1, D = 0, I = 0, N = 6, CER = 1 / 6
        char_error_rate = cer.compute(predictions=preds, references=refs)
        self.assertAlmostEqual(char_error_rate, 1 / 6, delta=self.TOLERANCE)

    def test_cer_del(self):
        """The prediction contains one extra character that must be deleted."""
        refs = ["Farkasokvoltak"]
        preds = ["Farkasokavoltak"]
        # S = 0, D = 1, I = 0, N = 14, CER = 1 / 14
        char_error_rate = cer.compute(predictions=preds, references=refs)
        self.assertAlmostEqual(char_error_rate, 1 / 14, delta=self.TOLERANCE)

    def test_cer_insert(self):
        """The prediction lacks one character that must be inserted."""
        refs = ["Farkasokvoltak"]
        preds = ["Farkasokoltak"]
        # S = 0, D = 0, I = 1, N = 14, CER = 1 / 14
        char_error_rate = cer.compute(predictions=preds, references=refs)
        self.assertAlmostEqual(char_error_rate, 1 / 14, delta=self.TOLERANCE)

    def test_cer_equal(self):
        """Identical prediction and reference give an error rate of exactly 0."""
        refs = ["Magyar"]
        char_error_rate = cer.compute(predictions=refs, references=refs)
        self.assertEqual(char_error_rate, 0.0)

    def test_cer_list_of_seqs(self):
        """Errors and reference lengths are accumulated over all pairs."""
        # ['Eötvös Loránd University','I love my daughter']
        refs = ["Eötvös Loránd Tudományegyetem", "szeretem a lányom"]
        char_error_rate = cer.compute(predictions=refs, references=refs)
        self.assertEqual(char_error_rate, 0.0)

        refs = ["diák", "Az arab nyelvet könnyű megtanulni!", "autó"]
        preds = ["dxák", "Az       arab nyelvet könnyű megtanulni!", "autó"]
        # S = 1, D = 0, I = 0, N = 4 + 34 + 4 = 42, CER = 1 / 42
        char_error_rate = cer.compute(predictions=preds, references=refs)
        self.assertAlmostEqual(char_error_rate, 1 / 42, delta=self.TOLERANCE)

    def test_correlated_sentences(self):
        """With ``concatenate_texts=True`` the inputs are scored as one text."""
        # Learn artificial intelligence to secure your future
        # Tanuljon mesterséges intelligenciát, hogy biztosítsa jövőjét
        refs = ["Tanuljon mesterséges intelligenciát,", " hogy biztosítsa jövőjét"]
        preds = ["Tanuljon mesterséges intelligenciát, hogy", " biztosítsa jövőjét"]
        # The leading whitespace of each second segment is stripped during
        # preprocessing, so after concatenation the texts are
        #   reference:  "...intelligenciát,hogy biztosítsa jövőjét"
        #   prediction: "...intelligenciát, hogybiztosítsa jövőjét"
        # i.e. one space has to be deleted and one inserted.
        # S = 0, D = 1, I = 1, N = 36 + 23 = 59, CER = 2 / 59
        char_error_rate = cer.compute(predictions=preds, references=refs, concatenate_texts=True)
        self.assertAlmostEqual(char_error_rate, 2 / 59, delta=self.TOLERANCE)

    def test_cer_empty(self):
        """An empty reference is rejected with ``ValueError``."""
        refs = [""]
        preds = ["tök mindegy"]
        with self.assertRaises(ValueError):
            cer.compute(predictions=preds, references=refs)

# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()