rahulshah63 committed on
Commit
4f3ed2b
1 Parent(s): f0e608c

Upload 4 files

Browse files
Files changed (4) hide show
  1. text/cleaners.py +90 -0
  2. text/cmudict.py +65 -0
  3. text/numbers.py +71 -0
  4. text/symbols.py +18 -0
text/cleaners.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ from https://github.com/keithito/tacotron """
2
+
3
+ '''
4
+ Cleaners are transformations that run over the input text at both training and eval time.
5
+
6
+ Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
7
+ hyperparameter. Some cleaners are English-specific. You'll typically want to use:
8
+ 1. "english_cleaners" for English text
9
+ 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
10
+ the Unidecode library (https://pypi.python.org/pypi/Unidecode)
11
+ 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
12
+ the symbols in symbols.py to match your data).
13
+ '''
14
+
15
+ import re
16
+ from unidecode import unidecode
17
+ from .numbers import normalize_numbers
18
+
19
+
20
+ # Regular expression matching whitespace:
21
+ _whitespace_re = re.compile(r'\s+')
22
+
23
+ # List of (regular expression, replacement) pairs for abbreviations:
24
+ _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
25
+ ('mrs', 'misess'),
26
+ ('mr', 'mister'),
27
+ ('dr', 'doctor'),
28
+ ('st', 'saint'),
29
+ ('co', 'company'),
30
+ ('jr', 'junior'),
31
+ ('maj', 'major'),
32
+ ('gen', 'general'),
33
+ ('drs', 'doctors'),
34
+ ('rev', 'reverend'),
35
+ ('lt', 'lieutenant'),
36
+ ('hon', 'honorable'),
37
+ ('sgt', 'sergeant'),
38
+ ('capt', 'captain'),
39
+ ('esq', 'esquire'),
40
+ ('ltd', 'limited'),
41
+ ('col', 'colonel'),
42
+ ('ft', 'fort'),
43
+ ]]
44
+
45
+
46
def expand_abbreviations(text):
    '''Replaces each abbreviation listed in _abbreviations with its expansion.

    The (pattern, replacement) pairs are applied one after another, in list
    order, and the resulting string is returned.
    '''
    for pattern, expansion in _abbreviations:
        text = pattern.sub(expansion, text)
    return text
50
+
51
+
52
def expand_numbers(text):
    '''Returns text after passing it through normalize_numbers.'''
    return normalize_numbers(text)
54
+
55
+
56
def lowercase(text):
    '''Returns a lowercased copy of text.'''
    return text.lower()
58
+
59
+
60
def collapse_whitespace(text):
    '''Replaces every run of whitespace in text with a single space.'''
    return _whitespace_re.sub(' ', text)
62
+
63
+
64
def convert_to_ascii(text):
    '''Returns the result of running text through unidecode.'''
    return unidecode(text)
66
+
67
+
68
def basic_cleaners(text):
    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
    return collapse_whitespace(lowercase(text))
73
+
74
+
75
def transliteration_cleaners(text):
    '''Pipeline for non-English text that transliterates to ASCII.

    Steps, in order: ASCII conversion, lowercasing, whitespace collapsing.
    '''
    text = convert_to_ascii(text)
    text = lowercase(text)
    return collapse_whitespace(text)
81
+
82
+
83
def english_cleaners(text):
    '''Pipeline for English text, including number and abbreviation expansion.

    Steps, in order: ASCII conversion, lowercasing, number expansion,
    abbreviation expansion, whitespace collapsing.
    '''
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = expand_numbers(text)
    text = expand_abbreviations(text)
    return collapse_whitespace(text)
text/cmudict.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ from https://github.com/keithito/tacotron """
2
+
3
+ import re
4
+
5
+
6
# Symbols accepted in a pronunciation; _get_pronunciation rejects any entry
# containing a token that is not listed here.
valid_symbols = [
    'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
    'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
    'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
    'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
    'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
    'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
    'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH',
]

# Set form of valid_symbols for constant-time membership tests.
_valid_symbol_set = set(valid_symbols)
17
+
18
+
19
class CMUDict:
    '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''

    def __init__(self, file_or_path, keep_ambiguous=True):
        '''Loads dictionary entries.

        Args:
          file_or_path: a path (str or os.PathLike) that is opened with
            latin-1 encoding, or any iterable of text lines such as an
            already-open file.
          keep_ambiguous: when False, words that have more than one
            pronunciation are dropped.
        '''
        import os  # local import: only needed to recognise path-like arguments

        if isinstance(file_or_path, (str, os.PathLike)):
            with open(file_or_path, encoding='latin-1') as f:
                entries = _parse_cmudict(f)
        else:
            entries = _parse_cmudict(file_or_path)
        if not keep_ambiguous:
            entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
        self._entries = entries

    def __len__(self):
        '''Returns the number of words stored.'''
        return len(self._entries)

    def lookup(self, word):
        '''Returns list of ARPAbet pronunciations of the given word.

        The word is upper-cased before the lookup; None is returned when the
        word is not present.
        '''
        return self._entries.get(word.upper())
39
+
40
+
41
+
42
# Matches the "(N)" suffix that marks an alternate pronunciation of a word.
_alt_re = re.compile(r'\([0-9]+\)')


def _parse_cmudict(file):
    '''Builds a {word: [pronunciation, ...]} dict from an iterable of lines.

    Only lines whose first character is an uppercase ASCII letter or an
    apostrophe are treated as entries; all other lines are ignored. Entries
    whose pronunciation is rejected by _get_pronunciation are skipped.
    '''
    cmudict = {}
    for line in file:
        if not line or not ('A' <= line[0] <= 'Z' or line[0] == "'"):
            continue
        # Split on the first run of whitespace so the word is separated from
        # the pronunciation whether one or several spaces delimit them.
        parts = line.split(None, 1)
        if len(parts) < 2:
            continue  # a word with no pronunciation: nothing to record
        word = _alt_re.sub('', parts[0])
        pronunciation = _get_pronunciation(parts[1])
        if pronunciation:
            cmudict.setdefault(word, []).append(pronunciation)
    return cmudict
58
+
59
+
60
def _get_pronunciation(s):
    '''Returns the symbols in s joined by single spaces, or None if invalid.

    s is split on whitespace; the result is None when there are no symbols or
    when any symbol is missing from _valid_symbol_set.
    '''
    parts = s.split()
    if not parts or any(part not in _valid_symbol_set for part in parts):
        return None
    return ' '.join(parts)
text/numbers.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ from https://github.com/keithito/tacotron """
2
+
3
+ import inflect
4
+ import re
5
+
6
+
7
# Shared inflect engine used to spell numbers out as words.
_inflect = inflect.engine()

# Digits with embedded commas, e.g. "1,000".
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
# Digits, a period, then more digits, e.g. "3.14".
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
# A pound sign followed by an amount.
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
# A dollar sign followed by an amount that may contain periods and commas.
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
# Digits followed by an ordinal suffix, e.g. "2nd".
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
# Any run of digits.
_number_re = re.compile(r'[0-9]+')
14
+
15
+
16
+ def _remove_commas(m):
17
+ return m.group(1).replace(',', '')
18
+
19
+
20
+ def _expand_decimal_point(m):
21
+ return m.group(1).replace('.', ' point ')
22
+
23
+
24
+ def _expand_dollars(m):
25
+ match = m.group(1)
26
+ parts = match.split('.')
27
+ if len(parts) > 2:
28
+ return match + ' dollars' # Unexpected format
29
+ dollars = int(parts[0]) if parts[0] else 0
30
+ cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
31
+ if dollars and cents:
32
+ dollar_unit = 'dollar' if dollars == 1 else 'dollars'
33
+ cent_unit = 'cent' if cents == 1 else 'cents'
34
+ return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
35
+ elif dollars:
36
+ dollar_unit = 'dollar' if dollars == 1 else 'dollars'
37
+ return '%s %s' % (dollars, dollar_unit)
38
+ elif cents:
39
+ cent_unit = 'cent' if cents == 1 else 'cents'
40
+ return '%s %s' % (cents, cent_unit)
41
+ else:
42
+ return 'zero dollars'
43
+
44
+
45
def _expand_ordinal(m):
    '''Returns the whole matched text converted by _inflect.number_to_words.'''
    return _inflect.number_to_words(m.group(0))
47
+
48
+
49
def _expand_number(m):
    '''Spells out the matched run of digits as words.

    Values strictly between 1000 and 3000 get special handling:
      * 2000 becomes 'two thousand';
      * 2001-2009 become 'two thousand ' plus the words for the last two digits;
      * exact multiples of 100 become '<n> hundred';
      * everything else is read in two-digit groups with zero spoken as 'oh'.
    All other values are converted directly, without the word 'and'.
    '''
    num = int(m.group(0))
    if not 1000 < num < 3000:
        return _inflect.number_to_words(num, andword='')
    if num == 2000:
        return 'two thousand'
    if 2000 < num < 2010:
        return 'two thousand ' + _inflect.number_to_words(num % 100)
    if num % 100 == 0:
        return _inflect.number_to_words(num // 100) + ' hundred'
    return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
62
+
63
+
64
def normalize_numbers(text):
    '''Rewrites the numeric expressions in text and returns the result.

    The substitutions run in a fixed order: commas inside numbers are removed
    first, then pound amounts, dollar amounts, decimals and ordinals are
    rewritten, and finally any remaining run of digits is expanded.
    '''
    text = _comma_number_re.sub(_remove_commas, text)
    text = _pounds_re.sub(r'\1 pounds', text)
    text = _dollars_re.sub(_expand_dollars, text)
    text = _decimal_number_re.sub(_expand_decimal_point, text)
    text = _ordinal_re.sub(_expand_ordinal, text)
    text = _number_re.sub(_expand_number, text)
    return text
text/symbols.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ from https://github.com/keithito/tacotron """
2
+
3
+ '''
4
+ Defines the set of symbols used in text input to the model.
5
+
6
+ The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. '''
7
+ from text import cmudict
8
+
9
_pad = '_'
_punctuation = '!\'(),.:;? '
_special = '-'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'

# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
_arpabet = ['@' + s for s in cmudict.valid_symbols]

# Export all symbols, in order: pad, special, punctuation, letters, ARPAbet.
symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet