Pendrokar committed
Commit 7f54c68 • 1 Parent(s): 7decaa4

requirements: git h2parser
Files changed (29):
  1. requirements.txt +1 -0
  2. resources/app/python/xvapitch/text/h2p_parser/__init__.py +0 -22
  3. resources/app/python/xvapitch/text/h2p_parser/__main__.py +0 -185
  4. resources/app/python/xvapitch/text/h2p_parser/cmudictext.py +0 -253
  5. resources/app/python/xvapitch/text/h2p_parser/compat/__init__.py +0 -7
  6. resources/app/python/xvapitch/text/h2p_parser/compat/cmudict.py +0 -19
  7. resources/app/python/xvapitch/text/h2p_parser/data/__init__.py +0 -0
  8. resources/app/python/xvapitch/text/h2p_parser/data/cmudict-0.7b.txt +0 -0
  9. resources/app/python/xvapitch/text/h2p_parser/data/cmudict.dict +0 -0
  10. resources/app/python/xvapitch/text/h2p_parser/data/dict.json +0 -1500
  11. resources/app/python/xvapitch/text/h2p_parser/data/example.json +0 -16
  12. resources/app/python/xvapitch/text/h2p_parser/dict_reader.py +0 -109
  13. resources/app/python/xvapitch/text/h2p_parser/dictionary.py +0 -85
  14. resources/app/python/xvapitch/text/h2p_parser/filter.py +0 -34
  15. resources/app/python/xvapitch/text/h2p_parser/format_ph.py +0 -99
  16. resources/app/python/xvapitch/text/h2p_parser/h2p.py +0 -123
  17. resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/PKG-INFO +0 -14
  18. resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/SOURCES.txt +0 -19
  19. resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/dependency_links.txt +0 -1
  20. resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/requires.txt +0 -2
  21. resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/top_level.txt +0 -1
  22. resources/app/python/xvapitch/text/h2p_parser/pos_parser.py +0 -17
  23. resources/app/python/xvapitch/text/h2p_parser/processors.py +0 -392
  24. resources/app/python/xvapitch/text/h2p_parser/symbols.py +0 -82
  25. resources/app/python/xvapitch/text/h2p_parser/text/__init__.py +0 -0
  26. resources/app/python/xvapitch/text/h2p_parser/text/numbers.py +0 -166
  27. resources/app/python/xvapitch/text/h2p_parser/utils/__init__.py +0 -0
  28. resources/app/python/xvapitch/text/h2p_parser/utils/converter.py +0 -79
  29. resources/app/python/xvapitch/text/h2p_parser/utils/parser.py +0 -133
requirements.txt CHANGED
@@ -73,3 +73,4 @@ webrtcvad==2.0.10
 wheel==0.36.2
 wrapt==1.14.1
 zipp==3.4.0
+git+https://github.com/ionite34/h2p-parser
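
This commit swaps the vendored copy of h2p_parser (deleted below) for the git dependency added above. A minimal consumption sketch, assuming the git-hosted package keeps the module layout of the deleted files:

# Sketch: after installing requirements.txt, which now pulls
# git+https://github.com/ionite34/h2p-parser, imports resolve to the
# installed package instead of the deleted vendored tree.
from h2p_parser import __version__, DATA_PATH  # package root, per __init__.py below
from h2p_parser.cmudictext import CMUDictExt   # extended G2P entry point

print(__version__)  # "1.0.0" in the deleted copy; the git version may differ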
resources/app/python/xvapitch/text/h2p_parser/__init__.py DELETED
@@ -1,22 +0,0 @@
-"""
-h2p_parser
-
-Heteronym to Phoneme Parser
-
-"""
-
-import sys
-
-if sys.version_info < (3, 9):
-    # In Python versions below 3.9, this is needed
-    from importlib_resources import files
-else:
-    # Since python 3.9+, importlib.resources.files is built-in
-    from importlib.resources import files
-
-__version__ = "1.0.0"
-
-# Data module
-DATA_PATH = files(__name__ + '.data')
-# Iterable collection of all files in data.
-DATA_FILES = DATA_PATH.iterdir()
resources/app/python/xvapitch/text/h2p_parser/__main__.py DELETED
@@ -1,185 +0,0 @@
-from collections import Counter
-
-from InquirerPy import inquirer
-from InquirerPy.utils import patched_print, color_print
-from InquirerPy.base.control import Choice
-from InquirerPy.validator import PathValidator
-from h2p_parser.utils import converter
-from h2p_parser.utils import parser
-
-
-def convert_h2p(input_file, output_file, delimiter):
-    """
-    Converts a h2p dictionary file from one format to another.
-    """
-    converter.bin_delim_to_json(input_file, output_file, delimiter)
-    print('Converted h2p_dict to json.')
-
-
-def prompt_action() -> str:
-    action = inquirer.select(
-        message='Select action:',
-        choices=[
-            "Convert",
-            "Parse",
-            Choice(value=None, name='Exit')
-        ],
-        default=0,
-    ).execute()
-    if not action:
-        exit(0)
-    return action
-
-
-def prompt_f_input():
-    """
-    Prompts for input file.
-    """
-    return inquirer.filepath(
-        message='Select input file:',
-        validate=PathValidator(is_file=True, message='Input must be a file.')
-    ).execute()
-
-
-def prompt_f_output():
-    """
-    Prompts for output file.
-    """
-    return inquirer.filepath(
-        message='Select output file:',
-        validate=PathValidator(is_file=True, message='Output must be a file.')
-    ).execute()
-
-
-def action_convert():
-    """
-    Converts a h2p dictionary file from one format to another.
-    """
-    # Select input file
-    input_file = prompt_f_input()
-    if not input_file:
-        return
-
-    # Select output file
-    output_file = prompt_f_output()
-    if not output_file:
-        return
-
-    # Ask for delimiter
-    delimiter = inquirer.text(
-        message='Enter delimiter:',
-        default='|'
-    ).execute()
-    if not delimiter:
-        return
-
-    # Run Process
-    convert_h2p(input_file, output_file, delimiter)
-
-
-def action_parse_file():
-    """
-    Parses a metadata.csv file and checks for dictionary coverage
-    :return:
-    """
-    # Select input file
-    input_file = prompt_f_input()
-    if not input_file:
-        return
-
-    # Ask for delimiter
-    delimiter = inquirer.text(
-        message='Enter delimiter:',
-        default='|'
-    ).execute()
-    if not delimiter:
-        return
-
-    # Run Process
-    result = parser.check_lines(parser.read_file(input_file, delimiter))
-
-    # Print results
-    color_print([("#e5c07b", "Unresolved Words")])
-    color_print([("#d21205", "[All]: "),
-                 ("#ffffff", f"{len(result.unres_all_words)}/{len(result.all_words)}")])
-    color_print([("#7e3b41", "[Unique]: "),
-                 ("#ffffff", f"{len(result.unres_words)}/{len(result.words)}")])
-
-    color_print([("#4ce5c8", "-" * 10)])
-
-    color_print([("#e5c07b", "Unresolved Lines")])
-    color_print([("#d21205", "[All]: "),
-                 ("#ffffff", f"{len(result.unres_all_lines)}/{len(result.all_lines)}")])
-    color_print([("#7e3b41", "[Unique]: "),
-                 ("#ffffff", f"{len(result.unres_lines)}/{len(result.lines)}")])
-
-    color_print([("#4ce5c8", "-" * 10)])
-
-    color_print([("#e5c07b", "Expected Coverage")])
-    color_print([("#d21205", "[Lines]: "),
-                 ("#ffffff", f"{result.line_coverage()}%")])
-    color_print([("#7e3b41", "[Words]: "),
-                 ("#ffffff", f"{result.word_coverage()}%")])
-
-    color_print([("#4ce5c8", "-" * 10)])
-
-    color_print([("#e5c07b", "H2p parser")])
-    color_print([("#d21205", "[Lines with Heteronyms]: "),
-                 ("#ffffff", f"{len(result.all_lines_cont_het)}/{len(result.all_lines)}"
-                             f" | {result.percent_line_het()}%")])
-    color_print([("#7e3b41", "[Words Resolved by H2p]: "),
-                 ("#ffffff", f"{result.n_words_het}/{result.n_words_res}"
-                             f" | {result.percent_word_h2p()}%")])
-    # Calcs
-    feature_res = result.n_words_fet
-    feature_percent = round(feature_res / result.n_words_res * 100, 2)
-    cmu_res = result.n_words_cmu
-    cmu_percent = round(cmu_res / result.n_words_res * 100, 2)
-    color_print([("#c8bd20", "[Transformed Resolves]: "),
-                 ("#ffffff", f"{feature_res}/{result.n_words_res}"
-                             f" | {feature_percent}%")])
-    color_print([("#25a0c8", "[Words in CMUDict]: "),
-                 ("#ffffff", f"{cmu_res}/{result.n_words_res}"
-                             f" | {cmu_percent}%")])
-
-    color_print([("#4ce5c8", "-" * 10)])
-
-    color_print([("#e5c07b", "Feature Usage")])
-
-    # Loop through feature results
-    for ft in result.ft_stats:
-        color_print([("#d21205", f"{ft}: "),
-                     ("#ffffff", f"{result.ft_stats[ft]}/{result.n_words_res}"
-                                 f" | {round(result.ft_stats[ft]/result.n_words_res*100, 2)}%")])
-
-    color_print([("#4ce5c8", "-" * 10)])
-
-    # Print 100 sampled unresolved words by frequency
-    color_print([("#e5c07b", "Top 100 most frequent unresolved words")])
-    # Count frequency of words
-    word_freq = Counter(result.unres_all_words)
-    # Sort by frequency
-    word_freq = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
-    # Print top 100
-    for word, freq in word_freq[:100]:
-        color_print([("#d21205", f"{word}: "),
-                     ("#ffffff", f"{freq}")])
-
-
-def entry():
-    """
-    Prints help information.
-    """
-    # Select action type
-    action = prompt_action()
-    if action == 'Convert':
-        action_convert()
-    elif action == 'Parse':
-        action_parse_file()
-
-
-if __name__ == "__main__":
-    entry()
-
-
-
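
The deleted CLI above wraps two operations; a non-interactive sketch of the same calls, assuming the git-hosted package keeps these module paths (file names are illustrative):

from h2p_parser.utils import converter, parser

# Convert a delimiter-separated h2p dictionary to JSON, as convert_h2p() does.
converter.bin_delim_to_json('heteronyms.txt', 'dict.json', '|')  # example paths

# Check dictionary coverage of a metadata file, as action_parse_file() does.
result = parser.check_lines(parser.read_file('metadata.csv', '|'))  # example path
print(f"lines: {result.line_coverage()}% | words: {result.word_coverage()}%")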
resources/app/python/xvapitch/text/h2p_parser/cmudictext.py DELETED
@@ -1,253 +0,0 @@
-# Extended Grapheme to Phoneme conversion using CMU Dictionary and Heteronym parsing.
-from __future__ import annotations
-
-import re
-from typing import Optional
-
-import pywordsegment
-import nltk
-from nltk.stem import WordNetLemmatizer
-from nltk.stem.snowball import SnowballStemmer
-from .h2p import H2p
-from .h2p import replace_first
-from . import format_ph as ph
-from .dict_reader import DictReader
-from .text.numbers import normalize_numbers
-from .filter import filter_text
-from .processors import Processor
-from copy import deepcopy
-
-re_digit = re.compile(r"\((\d+)\)")
-re_bracket_with_digit = re.compile(r"\(.*\)")
-
-# Check that the nltk data is downloaded, if not, download it
-try:
-    nltk.data.find('corpora/wordnet.zip')
-    nltk.data.find('corpora/omw-1.4.zip')
-except LookupError:
-    nltk.download('wordnet')
-    nltk.download('omw-1.4')
-
-
-class CMUDictExt:
-    def __init__(self, cmu_dict_path: str = None, h2p_dict_path: str = None, cmu_multi_mode: int = 0,
-                 process_numbers: bool = True, phoneme_brackets: bool = True, unresolved_mode: str = 'keep'):
-        # noinspection GrazieInspection
-        """
-        Initialize CMUDictExt - Extended Grapheme to Phoneme conversion using CMU Dictionary with Heteronym parsing.
-
-        CMU multi-entry resolution modes:
-            - -2 : Raw entry (i.e. 'A' resolves to 'AH0' and 'A(1)' to 'EY1')
-            - -1 : Skip resolving any entry with multiple pronunciations.
-            - 0 : Resolve using default un-numbered pronunciation.
-            - 1 : Resolve using (1) numbered pronunciation.
-            - n : Resolve using (n) numbered pronunciation.
-            - If a higher number is specified than available for the word, the highest available number is used.
-
-        Unresolved word resolution modes:
-            - keep : Keep the text-form word in the output.
-            - remove : Remove the text-form word from the output.
-            - drop : Return the line as None if any word is unresolved.
-
-        :param cmu_dict_path: Path to CMU dictionary file (.txt)
-        :type: str
-        :param h2p_dict_path: Path to Custom H2p dictionary (.json)
-        :type: str
-        :param cmu_multi_mode: CMU resolution mode for entries with multiple pronunciations.
-        :type: int
-        """
-
-        # Check valid unresolved_mode argument
-        if unresolved_mode not in ['keep', 'remove', 'drop']:
-            raise ValueError('Invalid value for unresolved_mode: {}'.format(unresolved_mode))
-        self.unresolved_mode = unresolved_mode
-
-        self.cmu_dict_path = cmu_dict_path  # Path to CMU dictionary file (.txt), if None, uses built-in
-        self.h2p_dict_path = h2p_dict_path  # Path to Custom H2p dictionary (.json), if None, uses built-in
-        self.cmu_multi_mode = cmu_multi_mode  # CMU multi-entry resolution mode
-        self.process_numbers = process_numbers  # Normalize numbers to text form, if enabled
-        self.phoneme_brackets = phoneme_brackets  # If True, phonemes are wrapped in curly brackets.
-        self.dict = DictReader(self.cmu_dict_path).dict  # CMU Dictionary
-        self.h2p = H2p(self.h2p_dict_path, preload=True)  # H2p parser
-        self.lemmatize = WordNetLemmatizer().lemmatize  # WordNet Lemmatizer - used to find singular form
-        self.stem = SnowballStemmer('english').stem  # Snowball Stemmer - used to find stem root of words
-        self.segment = pywordsegment.WordSegmenter().segment  # Word Segmenter
-        self.p = Processor(self)  # Processor for processing text
-
-        # Features
-        # Auto pluralization and de-pluralization
-        self.ft_auto_plural = True
-        # Auto splits and infers possessive forms of original words
-        self.ft_auto_pos = True
-        # Auto splits 'll
-        self.ft_auto_ll = True
-        # Auto splits and infers hyphenated words
-        self.ft_auto_hyphenated = True
-        # Auto splits possible compound words
-        self.ft_auto_compound = True
-        # Analyzes word root stem and infers pronunciation separately
-        # i.e. 'generously' -> 'generous' + 'ly'
-        self.ft_stem = True
-        # Forces compound words using manual lookup
-        self.ft_auto_compound_l2 = True
-
-    def lookup(self, text: str, pos: str = None, ph_format: str = 'sds') -> str | list | None:
-        # noinspection GrazieInspection
-        """
-        Gets the CMU Dictionary entry for a word.
-
-        Options for ph_format:
-
-        - 'sds' space delimited string
-        - 'sds_b' space delimited string with curly brackets
-        - 'list' list of phoneme strings
-
-        :param pos: Part of speech tag (Optional)
-        :param ph_format: Format of the phonemes to return:
-        :type: str
-        :param text: Word to lookup
-        :type: str
-        """
-
-        def format_as(in_phoneme):
-            if ph_format == 'sds':
-                output = ph.to_sds(in_phoneme)
-            elif ph_format == 'sds_b':
-                output = ph.with_cb(ph.to_sds(in_phoneme))
-            elif ph_format == 'list':
-                output = ph.to_list(in_phoneme)
-            else:
-                raise ValueError('Invalid value for ph_format: {}'.format(ph_format))
-            return output
-
-        # Get the CMU Dictionary entry for the word
-        word = text.lower()
-        entry = deepcopy(self.dict.get(word))  # Ensure safe copy of entry
-
-        # Has entry, return it directly
-        if entry is not None:
-            return format_as(entry)
-
-        # Auto Possessive Processor
-        if self.ft_auto_pos:
-            res = self.p.auto_possessives(word)
-            if res is not None:
-                return format_as(res)
-
-        # Auto Contractions for "ll" or "d"
-        if self.ft_auto_ll:
-            res = self.p.auto_contractions(word)
-            if res is not None:
-                return format_as(res)
-
-        # Check for hyphenated words
-        if self.ft_auto_hyphenated:
-            res = self.p.auto_hyphenated(word)
-            if res is not None:
-                return format_as(res)
-
-        # Check for compound words
-        if self.ft_auto_compound:
-            res = self.p.auto_compound(word)
-            if res is not None:
-                return format_as(res)
-
-        # No entry, detect if this is a multi-word entry
-        if '(' in word and ')' in word and any(char.isdigit() for char in word):
-            # Parse the integer from the word using regex
-            num = int(re.findall(re_digit, word)[0])
-            # If found
-            if num is not None:
-                # Remove the integer and bracket from the word
-                actual_word = re.sub(re_bracket_with_digit, "", word)
-                # See if this is a valid entry
-                result = deepcopy(self.dict.get(actual_word))  # Ensure safe copy of entry
-                # If found:
-                if result is not None:
-                    # Translate the integer to index
-                    index = min(num - 1, 0)
-                    # Check if index is less than the number of pronunciations
-                    if index < len(result):
-                        # Return the entry using the provided num index
-                        return format_as(result[index])
-                    # If entry is higher
-                    else:
-                        # Return the highest available entry
-                        return format_as(result[-1])
-
-        # Auto de-pluralization
-        # This is placed near the end because we need to do a pos-tag process
-        if self.ft_auto_plural:
-            res = self.p.auto_plural(word, pos)
-            if res is not None:
-                return format_as(res)
-
-        # Stem check
-        # noinspection SpellCheckingInspection
-        """
-        Supported modes for words ending in:
-        "ing", "ingly", "ly"
-        """
-        if self.ft_stem:
-            res = self.p.auto_stem(word)
-            if res is not None:
-                return format_as(res)
-
-        # Force compounding
-        if self.ft_auto_compound_l2:
-            res = self.p.auto_compound_l2(word)
-            if res is not None:
-                return format_as(res)
-
-        # If not found
-        return None
-
-    def convert(self, text: str) -> str | None:
-        # noinspection GrazieInspection
-        """
-        Replace a grapheme text line with phonemes.
-
-        :param text: Text line to be converted
-        :type: str
-        """
-
-        # Check valid unresolved_mode argument
-        if self.unresolved_mode not in ['keep', 'remove', 'drop']:
-            raise ValueError('Invalid value for unresolved_mode: {}'.format(self.unresolved_mode))
-        ur_mode = self.unresolved_mode
-
-        # Normalize numbers, if enabled
-        if self.process_numbers:
-            text = normalize_numbers(text)
-        # Filter and Tokenize
-        f_text = filter_text(text, preserve_case=True)
-        words = self.h2p.tokenize(f_text)
-        # Run POS tagging
-        tags = self.h2p.get_tags(words)
-
-        # Loop through words and pos tags
-        for word, pos in tags:
-            # Skip punctuation
-            if word == '.':
-                continue
-            # If word not in h2p dict, check CMU dict
-            if not self.h2p.dict.contains(word):
-                entry = self.lookup(word, pos)
-                if entry is None:
-                    if ur_mode == 'drop':
-                        return None
-                    if ur_mode == 'remove':
-                        text = replace_first(word, '', text)
-                    continue
-                # Do replace
-                f_ph = ph.with_cb(ph.to_sds(entry))
-                text = replace_first(word, f_ph, text)
-                continue
-            # For word in h2p dict, get phonemes
-            phonemes = self.h2p.dict.get_phoneme(word, pos)
-            # Format phonemes
-            f_ph = ph.with_cb(ph.to_sds(phonemes))
-            # Replace word with phonemes
-            text = replace_first(word, f_ph, text)
-        # Return text
-        return text
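
A short usage sketch matching the signatures above (None paths select the built-in dictionaries per the attribute comments; printed outputs are illustrative, not verified):

from h2p_parser.cmudictext import CMUDictExt

ext = CMUDictExt(unresolved_mode='keep')  # keep unresolved words as plain text
# Single-word lookup; 'sds_b' wraps the phonemes in curly brackets.
print(ext.lookup('cat', ph_format='sds_b'))  # e.g. '{K AE1 T}'
# Full-line conversion, resolving heteronyms via the H2p dictionary.
print(ext.convert('I read the book yesterday.'))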
resources/app/python/xvapitch/text/h2p_parser/compat/__init__.py DELETED
@@ -1,7 +0,0 @@
-"""
-Compatibility module.
-
-This module contains compatibility wrappers for existing
-implementations of CMUDict and other dictionaries.
-
-"""
resources/app/python/xvapitch/text/h2p_parser/compat/cmudict.py DELETED
@@ -1,19 +0,0 @@
-# Compatibility layer for using CMUDictExt with CMUDict-like API calls.
-# Designed to be compatible with the implementation of CMUDict in:
-# https://github.com/NVIDIA/DeepLearningExamples/
-#
-# Example usage:
-# from h2p_parser.compat.cmudict import CMUDict
-
-from h2p_parser.cmudictext import CMUDictExt
-
-
-class CMUDict(CMUDictExt):
-    def __init__(self, file_or_path=None, heteronyms_path=None, keep_ambiguous=True):
-        # Parameter Mapping:
-        # file_or_path => Mapped to cmu_dict_path
-        # heteronyms_path => Dropped as CMUDictExt uses H2p for heteronym parsing.
-        # keep_ambiguous => Mapped to cmu_multi_mode | True => -2, False => -1
-        super().__init__(file_or_path, heteronyms_path)
-        self._entries = {}
-        self.heteronyms = []
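
For reference, a hedged sketch of the NVIDIA-style call shape this shim was built for, per its parameter-mapping comments (illustrative only):

from h2p_parser.compat.cmudict import CMUDict

# Same constructor shape as CMUDict in NVIDIA/DeepLearningExamples;
# None selects the built-in CMU dictionary.
cmudict = CMUDict(file_or_path=None, keep_ambiguous=True)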
resources/app/python/xvapitch/text/h2p_parser/data/__init__.py DELETED
File without changes
resources/app/python/xvapitch/text/h2p_parser/data/cmudict-0.7b.txt DELETED
The diff for this file is too large to render. See raw diff
 
resources/app/python/xvapitch/text/h2p_parser/data/cmudict.dict DELETED
The diff for this file is too large to render. See raw diff
 
resources/app/python/xvapitch/text/h2p_parser/data/dict.json DELETED
@@ -1,1500 +0,0 @@
-{
-  "absent": {"DEFAULT": "AE1 B S AH0 N T", "VERB": "AH1 B S AE1 N T"},
-  "abstract": {"DEFAULT": "AE1 B S T R AE2 K T", "VERB": "AE0 B S T R AE1 K T"},
-  "abstracts": {"DEFAULT": "AE1 B S T R AE0 K T S", "VERB": "AE0 B S T R AE1 K T S"},
-  "abuse": {"DEFAULT": "AH0 B Y UW1 S", "VERB": "AH0 B Y UW1 Z"},
-  "abuses": {"DEFAULT": "AH0 B Y UW1 S IH0 Z", "VERB": "AH0 B Y UW1 Z IH0 Z"},
-  "accent": {"DEFAULT": "AE1 K S EH2 N T", "VERB": "AH0 K S EH1 N T"},
-  "accents": {"DEFAULT": "AE1 K S EH0 N T S", "VERB": "AE1 K S EH0 N T S"},
-  "addict": {"DEFAULT": "AE1 D IH2 K T", "VERB": "AH0 D IH1 K T"},
-  "addicts": {"DEFAULT": "AE1 D IH2 K T S", "VERB": "AH0 D IH1 K T S"},
-  "advocate": {"DEFAULT": "AE1 D V AH0 K AH0 T", "VERB": "AE1 D V AH0 K EY2 T"},
-  "advocates": {"DEFAULT": "AE1 D V AH0 K AH0 T S", "VERB": "AE1 D V AH0 K EY2 T S"},
-  "affect": {"DEFAULT": "AE1 F EH0 K T", "VERB": "AH0 F EH1 K T"},
-  "affects": {"DEFAULT": "AE1 F EH0 K T S", "VERB": "AH0 F EH1 K T S"},
-  "affix": {"DEFAULT": "AE1 F IH0 K S", "VERB": "AH0 F IH1 K S"},
-  "affixes": {"DEFAULT": "AE1 F IH0 K S IH0 Z", "VERB": "AH0 F IH1 K S IH0 Z"},
-  "agglomerate": {"DEFAULT": "AH0 G L AA1 M ER0 AH0 T", "VERB": "AH0 G L AA1 M ER0 EY2 T"},
-  "aggregate": {"DEFAULT": "AE1 G R AH0 G AH0 T", "VERB": "AE1 G R AH0 G EY0 T"},
-  "aggregates": {"DEFAULT": "AE1 G R AH0 G IH0 T S", "VERB": "AE1 G R AH0 G EY2 T S"},
-  "allies": {"DEFAULT": "AE1 L AY0 Z", "VERB": "AH0 L AY1 Z"},
-  "alloy": {"DEFAULT": "AE1 L OY2", "VERB": "AH0 L OY1"},
-  "alloys": {"DEFAULT": "AE1 L OY2 Z", "VERB": "AH0 L OY1 Z"},
-  "ally": {"DEFAULT": "AE1 L AY0", "VERB": "AH0 L AY1"},
-  "alternate": {"DEFAULT": "AO0 L T ER1 N AH0 T", "VERB": "AO1 L T ER0 N EY2 T"},
-  "analyses": {"DEFAULT": "AE1 N AH0 L AY0 Z IH2 Z", "VERB": "AH0 N AE1 L IH0 S IY2 Z"},
-  "animate": {"DEFAULT": "AE1 N AH0 M AH0 T", "VERB": "AE1 N AH0 M EY2 T"},
-  "annex": {"DEFAULT": "AE1 N EH2 K S", "VERB": "AH0 N EH1 K S"},
-  "annexes": {"DEFAULT": "AE1 N EH2 K S IH0 Z", "VERB": "AH0 N EH1 K S IH0 Z"},
-  "appropriate": {"DEFAULT": "AH0 P R OW1 P R IY0 AH0 T", "VERB": "AH0 P R OW1 P R IY0 EY2 T"},
-  "approximate": {"DEFAULT": "AH0 P R AA1 K S AH0 M AH0 T", "VERB": "AH0 P R AA1 K S AH0 M EY2 T"},
-  "articulate": {"DEFAULT": "AA0 R T IH1 K Y AH0 L EY2 T", "VERB": "AA0 R T IH1 K Y AH0 L AH0 T"},
-  "aspirate": {"DEFAULT": "AE1 S P ER0 AH0 T", "VERB": "AE1 S P ER0 EY2 T"},
-  "aspirates": {"DEFAULT": "AE1 S P ER0 AH0 T S", "VERB": "AE1 S P ER0 EY2 T S"},
-  "associate": {"DEFAULT": "AH0 S OW1 S IY0 AH0 T", "VERB": "AH0 S OW1 S IY0 EY2 T"},
-  "associates": {"DEFAULT": "AH0 S OW1 S IY0 AH0 T S", "VERB": "AH0 S OW1 S IY0 EY2 T S"},
-  "attribute": {"DEFAULT": "AE1 T R IH0 B Y UW0 T", "VERB": "AH0 T R IH1 B Y UW2 T"},
-  "attributes": {"DEFAULT": "AE1 T R IH0 B Y UW0 T S", "VERB": "AH0 T R IH1 B Y UW2 T S"},
-  "baths": {"DEFAULT": "B AE1 DH Z", "VERB": "B AE1 TH S"},
-  "blessed": {"DEFAULT": "B L EH1 S T", "VERB": "B L EH1 S IH0 D"},
-  "certificate": {"DEFAULT": "S ER0 T IH1 F IH0 K EY2 T", "VERB": "S ER0 T IH1 F IH0 K AH0 T"},
-  "certificates": {"DEFAULT": "S ER0 T IH1 F IH0 K AH0 T S", "VERB": "S ER0 T IH1 F IH0 K EY2 T S"},
-  "close": {"DEFAULT": "K L OW1 S", "VERB": "K L OW1 Z"},
-  "closer": {"DEFAULT": "K L OW1 S ER0", "NOUN": "K L OW1 Z ER0"},
-  "closes": {"DEFAULT": "K L OW1 S IH0 Z", "VERB": "K L OW1 Z IH0 Z"},
-  "collect": {"DEFAULT": "K AA1 L EH0 K T", "VERB": "K AH0 L EH1 K T"},
-  "collects": {"DEFAULT": "K AA1 L EH0 K T S", "VERB": "K AH0 L EH1 K T S"},
-  "combat": {"DEFAULT": "K AA1 M B AE0 T", "VERB": "K AH0 M B AE1 T"},
-  "combats": {"DEFAULT": "K AH1 M B AE0 T S", "VERB": "K AH0 M B AE1 T S"},
-  "combine": {"DEFAULT": "K AA1 M B AY0 N", "VERB": "K AH0 M B AY1 N"},
-  "commune": {"DEFAULT": "K AA1 M Y UW0 N", "VERB": "K AH0 M Y UW1 N"},
-  "communes": {"DEFAULT": "K AA1 M Y UW0 N Z", "VERB": "K AH0 M Y UW1 N Z"},
-  "compact": {"DEFAULT": "K AA1 M P AE0 K T", "VERB": "K AH0 M P AE1 K T"},
-  "compacts": {"DEFAULT": "K AA1 M P AE0 K T S", "VERB": "K AH0 M P AE1 K T S"},
-  "complex": {"ADJ": "K AH0 M P L EH1 K S", "DEFAULT": " K AA1 M P L EH0 K S"},
-  "compliment": {"DEFAULT": "K AA1 M P L AH0 M AH0 N T", "VERB": "K AA1 M P L AH0 M EH0 N T"},
-  "compliments": {"DEFAULT": "K AA1 M P L AH0 M AH0 N T S", "VERB": "K AA1 M P L AH0 M EH0 N T S"},
-  "compound": {"DEFAULT": "K AA1 M P AW0 N D", "VERB": "K AH0 M P AW1 N D"},
-  "compounds": {"DEFAULT": "K AA1 M P AW0 N D Z", "VERB": "K AH0 M P AW1 N D Z"},
-  "compress": {"DEFAULT": "K AA1 M P R EH0 S", "VERB": "K AH0 M P R EH1 S"},
-  "compresses": {"DEFAULT": "K AA1 M P R EH0 S AH0 Z", "VERB": "K AH0 M P R EH1 S IH0 Z"},
-  "concert": {"DEFAULT": "K AA1 N S ER0 T", "VERB": "K AH0 N S ER1 T"},
-  "concerts": {"DEFAULT": "K AA1 N S ER0 T S", "VERB": "K AH0 N S ER1 T S"},
-  "conduct": {"DEFAULT": "K AA1 N D AH0 K T", "VERB": "K AA0 N D AH1 K T"},
-  "confederate": {"DEFAULT": "K AH0 N F EH1 D ER0 AH0 T", "VERB": "K AH0 N F EH1 D ER0 EY2 T"},
-  "confederates": {"DEFAULT": "K AH0 N F EH1 D ER0 AH0 T S", "VERB": "K AH0 N F EH1 D ER0 EY2 T S"},
-  "confines": {"DEFAULT": "K AA1 N F AY2 N Z", "VERB": "K AH0 N F AY1 N Z"},
-  "conflict": {"DEFAULT": "K AA1 N F L IH0 K T", "VERB": "K AH0 N F L IH1 K T"},
-  "conflicts": {"DEFAULT": "K AA1 N F L IH0 K T S", "VERB": "K AH0 N F L IH1 K T S"},
-  "conglomerate": {"DEFAULT": "K AH0 N G L AA1 M ER0 AH0 T", "VERB": "K AH0 N G L AA1 M ER0 EY2 T"},
-  "conglomerates": {"DEFAULT": "K AH0 N G L AA1 M ER0 AH0 T S", "VERB": "K AH0 N G L AA1 M ER0 EY2 T S"},
-  "conscript": {"DEFAULT": "K AA1 N S K R IH0 P T", "VERB": "K AH0 N S K R IH1 P T"},
-  "conscripts": {"DEFAULT": "K AA1 N S K R IH0 P T S", "VERB": "K AH0 N S K R IH1 P T S"},
-  "console": {"DEFAULT": "K AA1 N S OW0 L", "VERB": "K AH0 N S OW1 L"},
-  "consoles": {"DEFAULT": "K AA1 N S OW0 L Z", "VERB": "K AH0 N S OW1 L Z"},
-  "consort": {"DEFAULT": "K AA1 N S AO0 R T", "VERB": "K AH0 N S AO1 R T"},
-  "construct": {"DEFAULT": "K AA1 N S T R AH0 K T", "VERB": "K AH0 N S T R AH1 K T"},
-  "constructs": {"DEFAULT": "K AA1 N S T R AH0 K T S", "VERB": "K AH0 N S T R AH1 K T S"},
-  "consummate": {"DEFAULT": "K AA0 N S AH1 M AH0 T", "VERB": "K AA1 N S AH0 M EY2 T"},
-  "content": {"DEFAULT": "K AH0 N T EH1 N T", "NOUN": "K AA1 N T EH0 N T"},
-  "contents": {"DEFAULT": "K AA1 N T EH0 N T S", "VERB": "K AH0 N T EH1 N T S"},
-  "contest": {"DEFAULT": "K AA1 N T EH0 S T", "VERB": "K AH0 N T EH1 S T"},
-  "contests": {"DEFAULT": "K AA1 N T EH0 S T S", "VERB": "K AH0 N T EH1 S T S"},
-  "contract": {"DEFAULT": "K AA1 N T R AE2 K T", "VERB": "K AH0 N T R AE1 K T"},
-  "contracts": {"DEFAULT": "K AA1 N T R AE2 K T S", "VERB": "K AH0 N T R AE1 K T S"},
-  "contrast": {"DEFAULT": "K AA1 N T R AE0 S T", "VERB": "K AH0 N T R AE1 S T"},
-  "contrasts": {"DEFAULT": "K AA1 N T R AE0 S T S", "VERB": "K AH0 N T R AE1 S T S"},
-  "converse": {"DEFAULT": "K AA1 N V ER0 S", "VERB": "K AH0 N V ER1 S"},
-  "convert": {"DEFAULT": "K AA1 N V ER0 T", "VERB": "K AH0 N V ER1 T"},
-  "converts": {"DEFAULT": "K AA1 N V ER0 T S", "VERB": "K AH0 N V ER1 T S"},
-  "convict": {"DEFAULT": "K AA1 N V IH0 K T", "VERB": "K AH0 N V IH1 K T"},
-  "convicts": {"DEFAULT": "K AA1 N V IH0 K T S", "VERB": "K AH0 N V IH1 K T S"},
-  "coordinate": {"DEFAULT": "K OW0 AO1 R D AH0 N AH0 T", "VERB": "K OW0 AO1 R D AH0 N EY2 T"},
-  "coordinates": {"DEFAULT": "K OW0 AO1 R D AH0 N AH0 T S", "VERB": "K OW0 AO1 R D AH0 N EY2 T S"},
-  "counterbalance": {"DEFAULT": "K AW2 N T ER0 B AE1 L AH0 N S", "VERB": "K AW1 N T ER0 B AE2 L AH0 N S"},
-  "counterbalances": {"DEFAULT": "K AW1 N T ER0 B AE2 L AH0 N S IH0 Z", "VERB": "K AW2 N T ER0 B AE1 L AH0 N S IH0 Z"},
-  "crabbed": {"DEFAULT": "K R AE1 B IH0 D", "VERB": "K R AE1 B D"},
-  "crooked": {"DEFAULT": "K R UH1 K AH0 D", "VERB": "K R UH1 K T"},
-  "curate": {"DEFAULT": "K Y UH1 R AH0 T", "VERB": "K Y UH0 R AH1 T"},
-  "cursed": {"DEFAULT": "K ER1 S IH0 D", "VERB": "K ER1 S T"},
-  "decoy": {"DEFAULT": "D IY1 K OY0", "VERB": "D IY0 K OY1"},
-  "decoys": {"DEFAULT": "D IY1 K OY0 Z", "VERB": "D IY0 K OY1 Z"},
-  "decrease": {"DEFAULT": "D IY1 K R IY2 S", "VERB": "D IH0 K R IY1 S"},
-  "decreases": {"DEFAULT": "D IY1 K R IY2 S IH0 Z", "VERB": "D IH0 K R IY1 S IH0 Z"},
-  "defect": {"DEFAULT": "D IY1 F EH0 K T", "VERB": "D IH0 F EH1 K T"},
-  "defects": {"DEFAULT": "D IY1 F EH0 K T S", "VERB": "D IH0 F EH1 K T S"},
-  "degenerate": {"DEFAULT": "D IH0 JH EH1 N ER0 AH0 T", "VERB": "D IH0 JH EH1 N ER0 EY2 T"},
-  "degenerates": {"DEFAULT": "D IH0 JH EH1 N ER0 AH0 T S", "VERB": "D IH0 JH EH1 N ER0 EY2 T S"},
-  "delegate": {"DEFAULT": "D EH1 L AH0 G AH0 T", "VERB": "D EH1 L AH0 G EY2 T"},
-  "delegates": {"DEFAULT": "D EH1 L AH0 G AH0 T S", "VERB": "D EH1 L AH0 G EY2 T S"},
-  "deliberate": {"DEFAULT": "D IH0 L IH1 B ER0 AH0 T", "VERB": "D IH0 L IH1 B ER0 EY2 T"},
-  "desert": {"DEFAULT": "D EH1 Z ER0 T", "VERB": "D IH0 Z ER1 T"},
-  "deserts": {"DEFAULT": "D EH1 Z ER0 T S", "VERB": "D IH0 Z ER1 T S"},
-  "desolate": {"DEFAULT": "D EH1 S AH0 L AH0 T", "VERB": "D EH1 S AH0 L EY2 T"},
-  "diagnoses": {"DEFAULT": "D AY2 AH0 G N OW1 S IY0 Z", "VERB": "D AY1 AH0 G N OW2 Z IY0 Z"},
-  "dictate": {"DEFAULT": "D IH1 K T EY2 T", "VERB": "D IH0 K T EY1 T"},
-  "dictates": {"DEFAULT": "D IH1 K T EY2 T S", "VERB": "D IH0 K T EY1 T S"},
-  "diffuse": {"DEFAULT": "D IH0 F Y UW1 S", "VERB": "D IH0 F Y UW1 Z"},
-  "digest": {"DEFAULT": "D AY1 JH EH0 S T", "VERB": "D AY0 JH EH1 S T"},
-  "digests": {"DEFAULT": "D AY1 JH EH0 S T S", "VERB": "D AY2 JH EH1 S T S"},
-  "discard": {"DEFAULT": "D IH1 S K AA0 R D", "VERB": "D IH0 S K AA1 R D"},
-  "discards": {"DEFAULT": "D IH1 S K AA0 R D Z", "VERB": "D IH0 S K AA1 R D Z"},
-  "discharge": {"DEFAULT": "D IH1 S CH AA2 R JH", "VERB": "D IH0 S CH AA1 R JH"},
-  "discharges": {"DEFAULT": "D IH1 S CH AA2 R JH AH0 Z", "VERB": "D IH0 S CH AA1 R JH AH0 Z"},
-  "discount": {"DEFAULT": "D IH1 S K AW0 N T", "VERB": "D IH0 S K AW1 N T"},
-  "discounts": {"DEFAULT": "D IH1 S K AW2 N T S", "VERB": "D IH0 S K AW1 N T S"},
-  "discourse": {"DEFAULT": "D IH1 S K AO0 R S", "VERB": "D IH0 S K AO1 R S"},
-  "discourses": {"DEFAULT": "D IH1 S K AO0 R S IH0 Z", "VERB": "D IH0 S K AO1 R S IH0 Z"},
-  "document": {"DEFAULT": "D AA1 K Y AH0 M AH0 N T", "VERB": "D AA1 K Y UW0 M EH0 N T"},
-  "documents": {"DEFAULT": "D AA1 K Y AH0 M AH0 N T S", "VERB": "D AA1 K Y UW0 M EH0 N T S"},
-  "dogged": {"DEFAULT": "D AO1 G D", "VERB": "D AO1 G IH0 D"},
-  "duplicate": {"DEFAULT": "D UW1 P L AH0 K AH0 T", "VERB": "D UW1 P L AH0 K EY2 T"},
-  "duplicates": {"DEFAULT": "D UW1 P L AH0 K AH0 T S", "VERB": "D UW1 P L AH0 K EY2 T S"},
-  "ejaculate": {"DEFAULT": "IH0 JH AE1 K Y UW0 L AH0 T", "VERB": "IH0 JH AE1 K Y UW0 L EY2 T"},
-  "ejaculates": {"DEFAULT": "IH0 JH AE1 K Y UW0 L AH0 T S", "VERB": "IH0 JH AE1 K Y UW0 L EY2 T S"},
-  "elaborate": {"DEFAULT": "IH0 L AE1 B R AH0 T", "VERB": "IH0 L AE1 B ER0 EY2 T"},
-  "entrance": {"DEFAULT": "EH1 N T R AH0 N S", "VERB": "IH0 N T R AH1 N S"},
-  "entrances": {"DEFAULT": "EH1 N T R AH0 N S AH0 Z", "VERB": "IH0 N T R AH1 N S AH0 Z"},
-  "envelope": {"DEFAULT": "EH1 N V AH0 L OW2 P", "VERB": "IH0 N V EH1 L AH0 P"},
-  "envelopes": {"DEFAULT": "EH1 N V AH0 L OW2 P S", "VERB": "IH0 N V EH1 L AH0 P S"},
-  "escort": {"DEFAULT": "EH1 S K AO0 R T", "VERB": "EH0 S K AO1 R T"},
-  "escorts": {"DEFAULT": "EH1 S K AO0 R T S", "VERB": "EH0 S K AO1 R T S"},
-  "essay": {"DEFAULT": "EH1 S EY2", "VERB": "EH0 S EY1"},
-  "essays": {"DEFAULT": "EH1 S EY2 Z", "VERB": "EH0 S EY1 Z"},
-  "estimate": {"DEFAULT": "EH1 S T AH0 M AH0 T", "VERB": "EH1 S T AH0 M EY2 T"},
-  "estimates": {"DEFAULT": "EH1 S T AH0 M AH0 T S", "VERB": "EH1 S T AH0 M EY2 T S"},
-  "excess": {"DEFAULT": "EH1 K S EH2 S", "VERB": "IH0 K S EH1 S"},
-  "excise": {"DEFAULT": "EH1 K S AY0 Z", "VERB": "EH0 K S AY1 S"},
-  "excuse": {"DEFAULT": "IH0 K S K Y UW1 S", "VERB": "IH0 K S K Y UW1 Z"},
-  "excuses": {"DEFAULT": "IH0 K S K Y UW1 S IH0 Z", "VERB": "IH0 K S K Y UW1 Z IH0 Z"},
-  "expatriate": {"DEFAULT": "EH0 K S P EY1 T R IY0 AH0 T", "VERB": "EH0 K S P EY1 T R IY0 EY2 T"},
-  "expatriates": {"DEFAULT": "EH0 K S P EY1 T R IY0 AH0 T S", "VERB": "EH0 K S P EY1 T R IY0 EY2 T S"},
-  "exploit": {"DEFAULT": "EH2 K S P L OY1 T", "VERB": "EH1 K S P L OY2 T"},
-  "exploits": {"DEFAULT": "EH2 K S P L OY1 T S", "VERB": "EH1 K S P L OY2 T S"},
-  "export": {"DEFAULT": "EH1 K S P AO0 R T", "VERB": "IH0 K S P AO1 R T"},
-  "exports": {"DEFAULT": "EH1 K S P AO0 R T S", "VERB": "IH0 K S P AO1 R T S"},
-  "extract": {"DEFAULT": "EH1 K S T R AE2 K T", "VERB": "IH0 K S T R AE1 K T"},
-  "extracts": {"DEFAULT": "EH1 K S T R AE2 K T S", "VERB": "IH0 K S T R AE1 K T S"},
-  "ferment": {"DEFAULT": "F ER1 M EH0 N T", "VERB": "F ER0 M EH1 N T"},
-  "ferments": {"DEFAULT": "F ER1 M EH0 N T S", "VERB": "F ER0 M EH1 N T S"},
-  "fragment": {"DEFAULT": "F R AE0 G M EH1 N T", "VERB": "F R AE1 G M AH0 N T"},
-  "fragments": {"DEFAULT": "F R AE1 G M AH0 N T S", "VERB": "F R AE0 G M EH1 N T S"},
-  "frequent": {"DEFAULT": "F R IY1 K W AH0 N T", "VERB": "F R IY1 K W EH2 N T"},
-  "graduate": {"DEFAULT": "G R AE1 JH AH0 W AH0 T", "VERB": "G R AE1 JH AH0 W EY2 T"},
-  "graduates": {"DEFAULT": "G R AE1 JH AH0 W AH0 T S", "VERB": "G R AE1 JH AH0 W EY2 T S"},
-  "house": {"DEFAULT": "HH AW1 S", "VERB": "HH AW1 Z"},
-  "impact": {"DEFAULT": "IH1 M P AE0 K T", "VERB": "IH2 M P AE1 K T"},
-  "impacts": {"DEFAULT": "IH1 M P AE0 K T S", "VERB": "IH2 M P AE1 K T S"},
-  "implant": {"DEFAULT": "IH1 M P L AE2 N T", "VERB": "IH2 M P L AE1 N T"},
-  "implants": {"DEFAULT": "IH1 M P L AE2 N T S", "VERB": "IH2 M P L AE1 N T S"},
-  "implement": {"DEFAULT": "IH1 M P L AH0 M AH0 N T", "VERB": "IH1 M P L AH0 M EH0 N T"},
-  "implements": {"DEFAULT": "IH1 M P L AH0 M AH0 N T S", "VERB": "IH1 M P L AH0 M EH0 N T S"},
-  "import": {"DEFAULT": "IH1 M P AO2 R T", "VERB": "IH2 M P AO1 R T"},
-  "imports": {"DEFAULT": "IH1 M P AO2 R T S", "VERB": "IH2 M P AO1 R T S"},
-  "impress": {"DEFAULT": "IH1 M P R EH0 S", "VERB": "IH0 M P R EH1 S"},
-  "imprint": {"DEFAULT": "IH2 M P R IH1 N T", "VERB": "IH1 M P R IH0 N T"},
-  "imprints": {"DEFAULT": "IH1 M P R IH0 N T S", "VERB": "IH2 M P R IH1 N T S"},
-  "incense": {"DEFAULT": "IH1 N S EH2 N S", "VERB": "IH2 N S EH1 N S"},
-  "incline": {"DEFAULT": "IH1 N K L AY0 N", "VERB": "IH2 N K L AY1 N"},
-  "inclines": {"DEFAULT": "IH1 N K L AY0 N Z", "VERB": "IH2 N K L AY1 N Z"},
-  "incorporate": {"DEFAULT": "IH2 N K AO1 R P ER0 AH0 T", "VERB": "IH2 N K AO1 R P ER0 EY2 T"},
-  "increase": {"DEFAULT": "IH1 N K R IY2 S", "VERB": "IH2 N K R IY1 S"},
-  "increases": {"DEFAULT": "IH1 N K R IY2 S IH0 Z", "VERB": "IH2 N K R IY1 S IH0 Z"},
-  "indent": {"DEFAULT": "IH1 N D EH0 N T", "VERB": "IH2 N D EH1 N T"},
-  "indents": {"DEFAULT": "IH1 N D EH0 N T S", "VERB": "IH2 N D EH1 N T S"},
-  "inebriate": {"DEFAULT": "IH2 N EH1 B R IY0 AH0 T", "VERB": "IH2 N EH1 B R IY0 EY2 T"},
-  "inebriates": {"DEFAULT": "IH2 N EH1 B R IY0 AH0 T S", "VERB": "IH2 N EH1 B R IY0 EY2 T S"},
-  "initiate": {"DEFAULT": "IH2 N IH1 SH IY0 AH0 T", "VERB": "IH2 N IH1 SH IY0 EY2 T"},
-  "initiates": {"DEFAULT": "IH2 N IH1 SH IY0 AH0 T S", "VERB": "IH2 N IH1 SH IY0 EY2 T S"},
-  "inlay": {"DEFAULT": "IH1 N L EY2", "VERB": "IH2 N L EY1"},
-  "inlays": {"DEFAULT": "IH1 N L EY2 Z", "VERB": "IH2 N L EY1 Z"},
-  "insert": {"DEFAULT": "IH1 N S ER2 T", "VERB": "IH2 N S ER1 T"},
-  "inserts": {"DEFAULT": "IH1 N S ER2 T S", "VERB": "IH2 N S ER1 T S"},
-  "inset": {"DEFAULT": "IH1 N S EH2 T", "VERB": "IH2 N S EH1 T"},
-  "insets": {"DEFAULT": "IH1 N S EH2 T S", "VERB": "IH2 N S EH1 T S"},
-  "instinct": {"DEFAULT": "IH1 N S T IH0 NG K T", "VERB": "IH2 N S T IH1 NG K T"},
-  "insult": {"DEFAULT": "IH1 N S AH2 L T", "VERB": "IH2 N S AH1 L T"},
-  "insults": {"DEFAULT": "IH1 N S AH2 L T S", "VERB": "IH2 N S AH1 L T S"},
-  "interchange": {"DEFAULT": "IH1 N T ER0 CH EY2 N JH", "VERB": "IH2 T ER0 CH EY1 N JH"},
-  "interchanges": {"DEFAULT": "IH1 N T ER0 CH EY2 N JH IH0 Z", "VERB": "IH2 T ER0 CH EY1 N JH IH0 Z"},
-  "interdict": {"DEFAULT": "IH1 N T ER0 D IH2 K T", "VERB": "IH2 N T ER0 D IH1 K T"},
-  "interdicts": {"DEFAULT": "IH1 N T ER0 D IH2 K T S", "VERB": "IH2 N T ER0 D IH1 K T S"},
-  "intern": {"DEFAULT": "IH1 N T ER0 N", "VERB": "IH0 N T ER1 N"},
-  "interns": {"DEFAULT": "IH1 N T ER0 N Z", "VERB": "IH0 N T ER1 N Z"},
-  "intimate": {"DEFAULT": "IH1 N T AH0 M AH0 T", "VERB": "IH1 N T IH0 M EY2 T"},
-  "intimates": {"DEFAULT": "IH1 N T AH0 M AH0 T S", "VERB": "IH1 N T IH0 M EY2 T S"},
-  "intrigue": {"DEFAULT": "IH1 N T R IY0 G", "VERB": "IH2 N T R IY1 G"},
-  "introvert": {"DEFAULT": "IH1 N T R AO0 V ER2 T", "VERB": "IH2 N T R AO0 V ER1 T"},
-  "introverts": {"DEFAULT": "IH1 N T R AO0 V ER2 T S", "VERB": "IH2 N T R AO0 V ER1 T S"},
-  "inverse": {"DEFAULT": "IH2 N V ER1 S", "VERB": "IH1 N V ER0 S"},
-  "invite": {"DEFAULT": "IH1 N V AY0 T", "VERB": "IH2 N V AY1 T"},
-  "invites": {"DEFAULT": "IH1 N V AY0 T S", "VERB": "IH2 N V AY1 T S"},
-  "jagged": {"DEFAULT": "JH AE1 G IH0 D", "VERB": "JH AE1 G D"},
-  "learned": {"DEFAULT": "L ER1 N D", "VERB": "L ER1 N IH0 D"},
-  "legitimate": {"DEFAULT": "L AH0 JH IH1 T AH0 M AH0 T", "VERB": "L AH0 JH IH1 T AH0 M EY2 T"},
-  "live": {"DEFAULT": "L AY1 V", "VERB": "L IH1 V"},
-  "lives": {"DEFAULT": "L AY1 V Z", "VERB": "L IH1 V Z"},
-  "mandate": {"DEFAULT": "M AE2 N D EY1 T", "VERB": "M AE1 N D EY2 T"},
-  "misconduct": {"DEFAULT": "M IH2 S K AA0 N D AH1 K T", "VERB": "M IH2 S K AA1 N D AH0 K T"},
-  "misprint": {"DEFAULT": "M IH1 S P R IH0 N T", "VERB": "M IH2 S P R IH1 N T"},
-  "misprints": {"DEFAULT": "M IH1 S P R IH0 N T S", "VERB": "M IH2 S P R IH1 N T S"},
-  "misuse": {"DEFAULT": "M IH0 S Y UW1 Z", "VERB": "M IH0 S Y UW1 S"},
-  "misuses": {"DEFAULT": "M IH0 S Y UW1 S IH0 Z", "VERB": "M IH0 S Y UW1 Z IH0 Z"},
-  "moderate": {"DEFAULT": "M AA1 D ER0 AH0 T", "VERB": "M AA1 D ER0 EY2 T"},
-  "moderates": {"DEFAULT": "M AA1 D ER0 AH0 T S", "VERB": "M AA1 D ER0 EY2 T S"},
-  "mouth": {"DEFAULT": "M AW1 DH", "VERB": "M AW1 TH"},
-  "mouths": {"DEFAULT": "M AW1 TH S", "VERB": "M AW1 DH Z"},
-  "object": {"DEFAULT": "AA1 B JH EH0 K T", "VERB": "AH0 B JH EH1 K T"},
-  "objects": {"DEFAULT": "AA1 B JH EH0 K T S", "VERB": "AH0 B JH EH1 K T S"},
-  "ornament": {"DEFAULT": "AO1 R N AH0 M AH0 N T", "VERB": "AO1 R N AH0 M EH0 N T"},
-  "ornaments": {"DEFAULT": "AO1 R N AH0 M AH0 N T S", "VERB": "AO1 R N AH0 M EH0 N T S"},
-  "overcharge": {"DEFAULT": "OW1 V ER0 CH AA2 R JH", "VERB": "OW2 V ER0 CH AA1 R JH"},
-  "overcharges": {"DEFAULT": "OW1 V ER0 CH AA2 R JH IH0 Z", "VERB": "OW2 V ER0 CH AA1 R JH IH0 Z"},
-  "overflow": {"DEFAULT": "OW1 V ER0 F L OW2", "VERB": "OW2 V ER0 F L OW1"},
-  "overflows": {"DEFAULT": "OW1 V ER0 F L OW2 Z", "VERB": "OW2 V ER0 F L OW1 Z"},
-  "overhang": {"DEFAULT": "OW1 V ER0 HH AE2 NG", "VERB": "OW2 V ER0 HH AE1 NG"},
-  "overhangs": {"DEFAULT": "OW1 V ER0 HH AE2 NG Z", "VERB": "OW2 V ER0 HH AE1 NG Z"},
-  "overhaul": {"DEFAULT": "OW1 V ER0 HH AO2 L", "VERB": "OW2 V ER0 HH AO1 L"},
-  "overhauls": {"DEFAULT": "OW1 V ER0 HH AO2 L Z", "VERB": "OW2 V ER0 HH AO1 L Z"},
-  "overlap": {"DEFAULT": "OW1 V ER0 L AE2 P", "VERB": "OW2 V ER0 L AE1 P"},
-  "overlaps": {"DEFAULT": "OW1 V ER0 L AE2 P S", "VERB": "OW2 V ER0 L AE1 P S"},
-  "overlay": {"DEFAULT": "OW1 V ER0 L EY2", "VERB": "OW2 V ER0 L EY1"},
-  "overlays": {"DEFAULT": "OW1 V ER0 L EY2 Z", "VERB": "OW2 V ER0 L EY1 Z"},
-  "overwork": {"DEFAULT": "OW1 V ER0 W ER2 K", "VERB": "OW2 V ER0 W ER1 K"},
-  "perfect": {"DEFAULT": "P ER1 F IH2 K T", "VERB": "P ER0 F EH1 K T"},
-  "perfume": {"DEFAULT": "P ER1 F Y UW0 M", "VERB": "P ER0 F Y UW1 M"},
-  "perfumes": {"DEFAULT": "P ER1 F Y UW0 M Z", "VERB": "P ER0 F Y UW1 M Z"},
-  "permit": {"DEFAULT": "P ER1 M IH2 T", "VERB": "P ER0 M IH1 T"},
-  "permits": {"DEFAULT": "P ER1 M IH2 T S", "VERB": "P ER0 M IH1 T S"},
-  "pervert": {"DEFAULT": "P ER1 V ER0 T", "VERB": "P ER0 V ER1 T"},
-  "perverts": {"DEFAULT": "P ER1 V ER0 T S", "VERB": "P ER0 V ER1 T S"},
-  "pontificate": {"DEFAULT": "P AA0 N T IH1 F AH0 K EY2 T", "VERB": "P AA0 N T IH1 F AH0 K AH0 T"},
-  "pontificates": {"DEFAULT": "P AA0 N T IH1 F AH0 K AH0 T S", "VERB": "P AA0 N T IH1 F AH0 K EY2 T S"},
-  "precipitate": {"DEFAULT": "P R IH0 S IH1 P IH0 T EY2 T", "VERB": "P R IH0 S IH1 P IH0 T AH0 T"},
-  "predicate": {"DEFAULT": "P R EH1 D AH0 K EY2 T", "VERB": "P R EH1 D IH0 K AH0 T"},
-  "predicates": {"DEFAULT": "P R EH1 D IH0 K AH0 T S", "VERB": "P R EH1 D AH0 K EY2 T S"},
-  "prefix": {"DEFAULT": "P R IY1 F IH0 K S", "VERB": "P R IY2 F IH1 K S"},
-  "prefixes": {"DEFAULT": "P R IY1 F IH0 K S IH0 JH", "VERB": "P R IY2 F IH1 K S IH0 JH"},
-  "presage": {"DEFAULT": "P R EH1 S IH0 JH", "VERB": "P R EH2 S IH1 JH"},
-  "presages": {"DEFAULT": "P R EH1 S IH0 JH IH0 JH", "VERB": "P R EH2 S IH1 JH IH0 JH"},
-  "present": {"DEFAULT": "P R EH1 Z AH0 N T", "VERB": "P R IY0 Z EH1 N T"},
-  "presents": {"DEFAULT": "P R EH1 Z AH0 N T S", "VERB": "P R IY0 Z EH1 N T S"},
-  "proceeds": {"DEFAULT": "P R OW1 S IY0 D Z", "VERB": "P R AH0 S IY1 D Z"},
-  "process": {"DEFAULT": "P R AA1 S EH2 S", "VERB": "P R AO2 S EH1 S"},
-  "processes": {"DEFAULT": "P R AO2 S EH1 S AH0 Z", "VERB": "P R AA1 S EH0 S AH0 Z"},
-  "processing": {"DEFAULT": "P R AA1 S EH0 S IH0 NG", "VERB": "P R AA0 S EH1 S IH0 NG"},
-  "produce": {"DEFAULT": "P R OW1 D UW0 S", "VERB": "P R AH0 D UW1 S"},
-  "progress": {"DEFAULT": "P R AA1 G R EH2 S", "VERB": "P R AH0 G R EH1 S"},
-  "progresses": {"DEFAULT": "P R AA1 G R EH2 S AH0 Z", "VERB": "P R OW0 G R EH1 S AH0 Z"},
-  "project": {"DEFAULT": "P R AA1 JH EH0 K T", "VERB": "P R AA0 JH EH1 K T"},
-  "projects": {"DEFAULT": "P R AA1 JH EH0 K T S", "VERB": "P R AA0 JH EH1 K T S"},
-  "prospect": {"DEFAULT": "P R AA1 S P EH0 K T", "VERB": "P R AH2 S P EH1 K T"},
-  "prospects": {"DEFAULT": "P R AA1 S P EH0 K T S", "VERB": "P R AH2 S P EH1 K T S"},
-  "prostrate": {"DEFAULT": "P R AA1 S T R EY0 T", "VERB": "P R AA0 S T R EY1 T"},
-  "protest": {"DEFAULT": "P R OW1 T EH2 S T", "VERB": "P R AH0 T EH1 S T"},
-  "protests": {"DEFAULT": "P R OW1 T EH2 S T S", "VERB": "P R AH0 T EH1 S T S"},
-  "purport": {"DEFAULT": "P ER1 P AO2 R T", "VERB": "P ER0 P AO1 R T"},
-  "quadruple": {"DEFAULT": "K W AA0 D R UW1 P AH0 L", "VERB": "K W AA1 D R UW0 P AH0 L"},
-  "quadruples": {"DEFAULT": "K W AA1 D R UW0 P AH0 L Z", "VERB": "K W AA0 D R UW1 P AH0 L Z"},
-  "ragged": {"DEFAULT": "R AE1 G AH0 D", "VERB": "R AE1 G D"},
-  "rampage": {"DEFAULT": "R AE1 M P EY2 JH", "VERB": "R AE2 M P EY1 JH"},
-  "rampages": {"DEFAULT": "R AE1 M P EY2 JH IH0 Z", "VERB": "R AE2 M P EY1 JH IH0 Z"},
-  "read": {"DEFAULT": "R IY1 D", "VBD": "R EH1 D", "VBN": "R EH1 D", "VBP": "R EH1 D"},
-  "rebel": {"DEFAULT": "R IH0 B EH1 L", "VERB": "R EH1 B AH0 L"},
-  "rebels": {"DEFAULT": "R EH1 B AH0 L Z", "VERB": "R IH0 B EH1 L Z"},
-  "rebound": {"DEFAULT": "R IY1 B AW0 N D", "VERB": "R IY0 B AW1 N D"},
-  "rebounds": {"DEFAULT": "R IY1 B AW0 N D Z", "VERB": "R IY0 B AW1 N D Z"},
-  "recall": {"DEFAULT": "R IY1 K AO2 L", "VERB": "R IH0 K AO1 L"},
-  "recalls": {"DEFAULT": "R IY1 K AO2 L Z", "VERB": "R IH0 K AO1 L Z"},
-  "recap": {"DEFAULT": "R IY1 K AE2 P", "VERB": "R IH0 K AE1 P"},
-  "recapped": {"DEFAULT": "R IY1 K AE2 P T", "VERB": "R IH0 K AE1 P T"},
-  "recapping": {"DEFAULT": "R IY1 K AE2 P IH0 NG", "VERB": "R IH0 K AE1 P IH0 NG"},
-  "recaps": {"DEFAULT": "R IY1 K AE2 P S", "VERB": "R IH0 K AE1 P S"},
-  "record": {"DEFAULT": "R EH1 K ER0 D", "VERB": "R IH0 K AO1 R D"},
-  "records": {"DEFAULT": "R EH1 K ER0 D Z", "VERB": "R IH0 K AO1 R D Z"},
-  "recount": {"DEFAULT": " R IH1 K AW0 N T", "VERB": "R IY2 K AW1 N T"},
-  "recounts": {"DEFAULT": " R IH1 K AW0 N T S", "VERB": "R IY2 K AW1 N T S"},
-  "refill": {"DEFAULT": "R IY1 F IH0 L", "VERB": "R IY0 F IH1 L"},
-  "refills": {"DEFAULT": "R IY1 F IH0 L Z", "VERB": "R IY0 F IH1 L Z"},
-  "refit": {"DEFAULT": "R IY1 F IH0 T", "VERB": "R IY0 F IH1 T"},
-  "refits": {"DEFAULT": "R IY1 F IH0 T S", "VERB": "R IY0 F IH1 T S"},
-  "refresh": {"DEFAULT": "R IH1 F R EH0 SH", "VERB": "R IH0 F R EH1 SH"},
-  "refund": {"DEFAULT": "R IY1 F AH2 N D", "VERB": "R IH0 F AH1 N D"},
-  "refunds": {"DEFAULT": "R IY1 F AH2 N D Z", "VERB": "R IH0 F AH1 N D Z"},
-  "refuse": {"DEFAULT": "R EH1 F Y UW2 Z", "VERB": "R IH0 F Y UW1 Z"},
-  "regenerate": {"DEFAULT": "R IY0 JH EH1 N ER0 AH0 T", "VERB": "R IY0 JH EH1 N ER0 EY2 T"},
-  "rehash": {"DEFAULT": "R IY1 HH AE0 SH", "VERB": "R IY0 HH AE1 SH"},
-  "rehashes": {"DEFAULT": "R IY1 HH AE0 SH IH0 Z", "VERB": "R IY0 HH AE1 SH IH0 Z"},
-  "reincarnate": {"DEFAULT": "R IY2 IH0 N K AA1 R N AH0 T", "VERB": "R IY2 IH0 N K AA1 R N EY2 T"},
-  "reject": {"DEFAULT": "R IY1 JH EH0 K T", "VERB": "R IH0 JH EH1 K T"},
-  "rejects": {"DEFAULT": "R IY1 JH EH0 K T S", "VERB": "R IH0 JH EH1 K T S"},
-  "relay": {"DEFAULT": "R IY1 L EY2", "VERB": "R IY2 L EY1"},
-  "relaying": {"DEFAULT": "R IY1 L EY2 IH0 NG", "VERB": "R IY2 L EY1 IH0 NG"},
-  "relays": {"DEFAULT": "R IY1 L EY2 Z", "VERB": "R IY2 L EY1 Z"},
-  "remake": {"DEFAULT": "R IY1 M EY0 K", "VERB": "R IY2 M EY1 K"},
-  "remakes": {"DEFAULT": "R IY1 M EY0 K S", "VERB": "R IY2 M EY1 K S"},
-  "replay": {"DEFAULT": "R IY1 P L EY0", "VERB": "R IY0 P L EY1"},
-  "replays": {"DEFAULT": "R IY1 P L EY0 Z", "VERB": "R IY0 P L EY1 Z"},
-  "reprint": {"DEFAULT": "R IY1 P R IH0 N T", "VERB": "R IY0 P R IH1 N T"},
-  "reprints": {"DEFAULT": "R IY1 P R IH0 N T S", "VERB": "R IY0 P R IH1 N T S"},
-  "rerun": {"DEFAULT": "R IY1 R AH0 N", "VERB": "R IY2 R AH1 N"},
-  "reruns": {"DEFAULT": "R IY1 R AH0 N Z", "VERB": "R IY2 R AH1 N Z"},
-  "resume": {"DEFAULT": "R EH1 Z AH0 M EY2", "VERB": "R IY0 Z UW1 M"},
-  "retake": {"DEFAULT": "R IY1 T EY0 K", "VERB": "R IY0 T EY1 K"},
-  "retakes": {"DEFAULT": "R IY1 T EY0 K S", "VERB": "R IY0 T EY1 K S"},
-  "rethink": {"DEFAULT": "R IY1 TH IH0 NG K", "VERB": "R IY2 TH IH1 NG K"},
-  "rethinks": {"DEFAULT": "R IY1 TH IH0 NG K S", "VERB": "R IY2 TH IH1 NG K S"},
-  "retread": {"DEFAULT": "R IY1 T R EH0 D", "VERB": "R IY2 T R EH1 D"},
-  "retreads": {"DEFAULT": "R IY1 T R EH0 D Z", "VERB": "R IY2 T R EH1 D Z"},
-  "rewrite": {"DEFAULT": "R IY1 R AY2 T", "VERB": "R IY0 R AY1 T"},
-  "rewrites": {"DEFAULT": "R IY1 R AY2 T S", "VERB": "R IY0 R AY1 T S"},
-  "segment": {"DEFAULT": "S EH2 G M EH1 N T", "VERB": "S EH1 G M AH0 N T"},
-  "segments": {"DEFAULT": "S EH1 G M AH0 N T S", "VERB": "S EH2 G M EH1 N T S"},
-  "separate": {"DEFAULT": "S EH1 P ER0 IH0 T", "VERB": "S EH1 P ER0 EY2 T"},
-  "separates": {"DEFAULT": "S EH1 P ER0 IH0 T S", "VERB": "S EH1 P ER0 EY2 T S"},
-  "subcontract": {"DEFAULT": "S AH2 B K AA0 N T R AE1 K T", "VERB": "S AH0 B K AA1 N T R AE2 K T"},
-  "subcontracts": {"DEFAULT": "S AH0 B K AA1 N T R AE2 K T S", "VERB": "S AH2 B K AA0 N T R AE1 K T S"},
-  "subject": {"DEFAULT": "S AH1 B JH IH0 K T", "VERB": "S AH0 B JH EH1 K T"},
-  "subjects": {"DEFAULT": "S AH1 B JH IH0 K T S", "VERB": "S AH0 B JH EH1 K T S"},
-  "subordinate": {"DEFAULT": "S AH0 B AO1 R D AH0 N AH0 T", "VERB": "S AH0 B AO1 R D AH0 N EY2 T"},
-  "subordinates": {"DEFAULT": "S AH0 B AO1 R D AH0 N AH0 T S", "VERB": "S AH0 B AO1 R D AH0 N EY2 T S"},
-  "supplement": {"DEFAULT": "S AH1 P L AH0 M AH0 N T", "VERB": "S AH1 P L AH0 M EH0 N T"},
-  "supplements": {"DEFAULT": "S AH1 P L AH0 M AH0 N T S", "VERB": "S AH1 P L AH0 M EH0 N T S"},
-  "surmise": {"DEFAULT": "S ER1 M AY0 Z", "VERB": "S ER0 M AY1 Z"},
-  "surmises": {"DEFAULT": "S ER1 M AY0 Z IH0 Z", "VERB": "S ER0 M AY1 Z IH0 Z"},
-  "survey": {"DEFAULT": "S ER1 V EY2", "VERB": "S ER0 V EY1"},
-  "surveys": {"DEFAULT": "S ER1 V EY2 Z", "VERB": "S ER0 V EY1 Z"},
-  "suspect": {"DEFAULT": "S AH1 S P EH2 K T", "VERB": "S AH0 S P EH1 K T"},
-  "suspects": {"DEFAULT": "S AH1 S P EH2 K T S", "VERB": "S AH0 S P EH1 K T S"},
-  "syndicate": {"DEFAULT": "S IH1 N D IH0 K AH0 T", "VERB": "S IH1 N D AH0 K EY2 T"},
-  "syndicates": {"DEFAULT": "S IH1 N D IH0 K AH0 T S", "VERB": "S IH1 N D IH0 K EY2 T S"},
-  "torment": {"DEFAULT": "T AO0 R M EH1 N T", "VERB": "T AO1 R M EH2 N T"},
-  "transfer": {"DEFAULT": "T R AE1 N S F ER0", "VERB": "T R AE0 N S F ER1"},
-  "transfers": {"DEFAULT": "T R AE1 N S F ER0 Z", "VERB": "T R AE0 N S F ER1 Z"},
-  "transplant": {"DEFAULT": "T R AE1 N S P L AE0 N T", "VERB": "T R AE0 N S P L AE1 N T"},
-  "transplants": {"DEFAULT": "T R AE1 N S P L AE0 N T S", "VERB": "T R AE0 N S P L AE1 N T S"},
-  "transport": {"DEFAULT": "T R AE1 N S P AO0 R T", "VERB": "T R AE0 N S P AO1 R T"},
-  "transports": {"DEFAULT": "T R AE1 N S P AO0 R T S", "VERB": "T R AE0 N S P AO1 R T S"},
-  "triplicate": {"DEFAULT": "T R IH1 P L IH0 K AH0 T", "VERB": "T R IH1 P L IH0 K EY2 T"},
-  "triplicates": {"DEFAULT": "T R IH1 P L IH0 K AH0 T S", "VERB": "T R IH1 P L IH0 K EY2 T S"},
-  "undercut": {"DEFAULT": "AH1 N D ER0 K AH2 T", "VERB": "AH2 N D ER0 K AH1 T"},
-  "underestimate": {"DEFAULT": "AH1 N D ER0 EH1 S T AH0 M AH0 T", "VERB": "AH1 N D ER0 EH1 S T AH0 M EY2 T"},
-  "underestimates": {"DEFAULT": "AH1 N D ER0 EH1 S T AH0 M AH0 T S", "VERB": "AH1 N D ER0 EH1 S T AH0 M EY2 T S"},
-  "underline": {"DEFAULT": "AH1 N D ER0 L AY2 N", "VERB": "AH2 N D ER0 L AY1 N"},
-  "underlines": {"DEFAULT": "AH1 N D ER0 L AY2 N Z", "VERB": "AH2 N D ER0 L AY1 N Z"},
-  "undertaking": {"DEFAULT": "AH1 N D ER0 T EY2 K IH0 NG", "VERB": "AH2 N D ER0 T EY1 K IH0 NG"},
-  "undertakings": {"DEFAULT": "AH1 N D ER0 T EY2 K IH0 NG Z", "VERB": "AH2 N D ER0 T EY1 K IH0 NG Z"},
-  "unused": {"DEFAULT": "AH0 N Y UW1 S T", "VERB": "AH0 N Y UW1 Z D"},
-  "upgrade": {"DEFAULT": "AH1 P G R EY0 D", "VERB": "AH0 P G R EY1 D"},
-  "upgrades": {"DEFAULT": "AH1 P G R EY0 D Z", "VERB": "AH0 P G R EY1 D Z"},
-  "uplift": {"DEFAULT": "AH1 P L IH0 F T", "VERB": "AH2 P L IH1 F T"},
-  "upset": {"DEFAULT": "AH1 P S EH2 T", "VERB": "AH0 P S EH1 T"},
-  "upsets": {"DEFAULT": "AH1 P S EH2 T S", "VERB": "AH0 P S EH1 T S"},
-  "use": {"DEFAULT": "Y UW1 S", "VERB": "Y UW1 Z"},
-  "used": {"DEFAULT": "Y UW1 S T", "VBN": "Y UW1 Z D"},
-  "uses": {"DEFAULT": "Y UW1 S IH0 Z", "VERB": "Y UW1 Z IH0 Z"}
-}
resources/app/python/xvapitch/text/h2p_parser/data/example.json DELETED
@@ -1,16 +0,0 @@
- {
-     "absent": {
-         "VERB": "AH1 B S AE1 N T",
-         "DEFAULT": "AE1 B S AH0 N T"
-     },
-     "reject": {
-         "VERB": "R IH0 JH EH1 K T",
-         "DEFAULT": "R IY1 JH EH0 K T"
-     },
-     "read": {
-         "VBD": "R EH1 D",
-         "VBN": "R EH1 D",
-         "VBP": "R EH1 D",
-         "DEFAULT": "R IY1 D"
-     }
- }
resources/app/python/xvapitch/text/h2p_parser/dict_reader.py DELETED
@@ -1,109 +0,0 @@
- # This reads a CMUDict formatted dictionary as a dictionary object
- import re
- from python.xvapitch.text.h2p_parser import format_ph as ph
- from . import DATA_PATH
- 
- 
- _dict_primary = 'cmudict.dict'
- 
- 
- def read_dict(filename: str) -> list:
-     # Read the file
-     with open(filename, encoding='utf-8', mode='r') as f:
-         # Read the file into lines
-         lines = f.readlines()
-     # Remove any line starting with ";;;"
-     lines = [line for line in lines if not line.startswith(';;;')]
-     return lines
- 
- 
- def parse_dict(lines: list) -> dict:
-     # Create a dictionary to store the parsed data
-     parsed_dict = {}
-     # Detect file format
- 
-     # We will read the first 10 lines to determine the format
-     # Default to SSD format unless we find otherwise
-     dict_form = 'SSD'
-     for line in lines[:10]:
-         # Strip new lines
-         line = line.strip()
-         if line == '':
-             continue
-         """
-         Format 1 (Double Space Delimited):
-         - Comment allowed to start with ";;;"
-         WORD  W ER1 D
- 
-         Format 2 (Single Space Delimited):
-         - Comment allowed at end of any line using "#"
-         WORD W ER1 D # Comment
-         """
-         if '  ' in line:
-             dict_form = 'DSD'
-             break
- 
-     # Iterate over the lines
-     for line in lines:
-         # Skip empty lines and lines with no space
-         line = line.strip()
-         if line == '' or ' ' not in line:
-             continue
- 
-         # Split depending on format
-         if dict_form == 'DSD':
-             pairs = line.split('  ')
-         else:
-             space_index = line.find(' ')
-             line_split = line[:space_index], line[space_index + 1:]
-             pairs = line_split[0], line_split[1].split('#')[0]
- 
-         word = str.lower(pairs[0])  # Get word and lowercase it
-         phonemes = ph.to_list(pairs[1])  # Convert to list of phonemes
-         phonemes = [phonemes]  # Wrap in nested list
-         word_num = 0
-         word_orig = None
- 
-         # Detect if this is an alternate-pronunciation entry, e.g. "word(2)"
-         if ('(' in word) and (')' in word) and any(char.isdigit() for char in word):
-             # Parse the integer from the word using regex
-             result = int(re.findall(r"\((\d+)\)", word)[0])
-             # If found
-             if result is not None:
-                 # Set the original word
-                 word_orig = word
-                 # Remove the integer and bracket from the word
-                 word = re.sub(r"\(.*\)", "", word)
-                 # Set the word number to the result
-                 word_num = result
- 
-         # Check existing key
-         if word in parsed_dict:
-             # If word number is 0, ignore
-             if word_num == 0:
-                 continue
-             # If word number is not 0, add phoneme to existing key at index
-             parsed_dict[word].extend(phonemes)
-             # Also add the original word if it exists
-             if word_orig is not None:
-                 parsed_dict[word_orig] = phonemes
-         else:
-             # Create a new key
-             parsed_dict[word] = phonemes
- 
-     # Return the dictionary
-     return parsed_dict
- 
- 
- class DictReader:
-     def __init__(self, filename=None):
-         self.filename = filename
-         self.dict = {}
-         # If filename is None, use the default dictionary
-         # default = 'data' uses the dictionary file in the data module
-         # default = 'nltk' uses the nltk cmudict
-         if filename is not None:
-             self.dict = parse_dict(read_dict(filename))
-         else:
-             with DATA_PATH.joinpath(_dict_primary) as f:
-                 self.dict = parse_dict(read_dict(f))
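Note: parse_dict handles both layouts described above. A minimal sketch of the single-space-delimited path, assuming the module is importable under the upstream h2p_parser package name and using the two HELLO variants found in cmudict.dict:

    from h2p_parser.dict_reader import parse_dict

    lines = ['hello HH AH0 L OW1\n', 'hello(2) HH EH0 L OW1\n']
    parse_dict(lines)
    # Alternate pronunciations merge under the base word and are also
    # kept under their original key:
    # {'hello':    [['HH', 'AH0', 'L', 'OW1'], ['HH', 'EH0', 'L', 'OW1']],
    #  'hello(2)': [['HH', 'EH0', 'L', 'OW1']]}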
resources/app/python/xvapitch/text/h2p_parser/dictionary.py DELETED
@@ -1,85 +0,0 @@
- # dictionary.py
- 
- # Defines a dictionary class that can be used to store and retrieve from the json file
- import sys
- if sys.version_info < (3, 9):
-     # In Python versions below 3.9, this is needed
-     import importlib_resources as pkg_resources
- else:
-     # Since python 3.9+, importlib.resources.files is built-in
-     import importlib.resources as pkg_resources
- from os.path import exists
- import json
- from python.xvapitch.text.h2p_parser import pos_parser
- 
- 
- # Method to get data path
- def get_data_path():
-     data_path = pkg_resources.files('h2p_parser.data')
-     if data_path is None:
-         raise FileNotFoundError("Data folder not found")
-     return data_path
- 
- 
- # Dictionary class
- class Dictionary:
-     def __init__(self, file_name=None):
-         # If a file name is not provided, use the default file name
-         if file_name is None:
-             self.file_name = 'dict.json'
-             self.use_default = True
-         else:
-             self.file_name = file_name
-             self.use_default = False
-         self.dictionary = self.load_dictionary(file_name)
- 
-     # Loads the dictionary from the json file
-     def load_dictionary(self, path=None):
-         if path is None:
-             data_path = get_data_path()
-             dict_path = data_path.joinpath(self.file_name)
-             with open(str(dict_path)) as def_file:
-                 read_dict = json.load(def_file)
-         else:
-             if not exists(path):
-                 raise FileNotFoundError(f'Dictionary {self.file_name} file not found')
-             with open(path) as file:
-                 try:
-                     read_dict = json.load(file)
-                 except json.decoder.JSONDecodeError:
-                     raise ValueError(f'Dictionary {self.file_name} file is not valid JSON')
-         # Check dictionary has at least one entry
-         if len(read_dict) == 0:
-             raise ValueError('Dictionary is empty or invalid')
-         return read_dict
- 
-     # Check if a word is in the dictionary
-     def contains(self, word):
-         word = word.lower()
-         return word in self.dictionary
- 
-     # Get the phonetic pronunciation of a word using Part of Speech tag
-     def get_phoneme(self, word, pos):
-         # Get the sub-dictionary at dictionary[word]
-         sub_dict = self.dictionary[word.lower()]
- 
-         # First, check if the exact pos is a key
-         if pos in sub_dict:
-             return sub_dict[pos]
- 
-         # If not, use the parent pos of the pos tag
-         parent_pos = pos_parser.get_parent_pos(pos)
- 
-         if parent_pos is not None:
-             # Check if the sub_dict contains the parent pos
-             if parent_pos in sub_dict:
-                 return sub_dict[parent_pos]
- 
-         # If not, check if the sub_dict contains a DEFAULT key
-         if 'DEFAULT' in sub_dict:
-             return sub_dict['DEFAULT']
- 
-         # If no matches, return None
-         return None
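Note: get_phoneme resolves in the order exact tag, parent tag, then DEFAULT. A minimal sketch against the entries shown in example.json above (import path assumes the upstream h2p_parser layout):

    from h2p_parser.dictionary import Dictionary

    d = Dictionary()                # loads the bundled dict.json
    d.contains('reject')            # True
    d.get_phoneme('reject', 'VBD')  # VBD -> parent VERB -> 'R IH0 JH EH1 K T'
    d.get_phoneme('reject', 'NN')   # no NOUN key -> DEFAULT -> 'R IY1 JH EH0 K T'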
resources/app/python/xvapitch/text/h2p_parser/filter.py DELETED
@@ -1,34 +0,0 @@
- from unicodedata import normalize
- import re
- 
- # Pre-compile regex
- re_filter = re.compile(r"[^ A-Za-z'.,?!()\-]")
- re_filter_with_num = re.compile(r"[^ A-Za-z\d'.,?!()\-]")
- re_multi_space = re.compile(r"\s\s+")
- 
- 
- # Filters text before parsing
- # @param text: text to be filtered
- # @return: filtered text
- def filter_text(text: str, allow_num: bool = False, preserve_case: bool = False) -> str:
-     """
-     Filters text before parsing
-     :param preserve_case: True to keep the original casing
-     :param allow_num: True if numbers are allowed
-     :param text: Input raw text
-     :return: Text after stripped accents, lower-cased, and invalid punctuation removed
-     """
-     # Strip accents
-     text = normalize('NFD', text)
-     # To lowercase
-     if not preserve_case:
-         text = text.lower()
-     # Remove all invalid punctuation
-     if allow_num:
-         text = re.sub(re_filter_with_num, '', text)
-     else:
-         text = re.sub(re_filter, '', text)
-     # Collapse runs of 2 or more whitespace characters into one space
-     text = re.sub(re_multi_space, " ", text)
-     # Return
-     return text
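Note: roughly, filter_text strips accents, lower-cases, drops disallowed characters, and collapses repeated spaces. Expected behaviour per the regexes above (a sketch):

    from h2p_parser.filter import filter_text

    filter_text('Séance  on  Friday 1999!')
    # -> 'seance on friday !'  (digits removed by default)
    filter_text('Séance 1999!', allow_num=True)
    # -> 'seance 1999!'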
resources/app/python/xvapitch/text/h2p_parser/format_ph.py DELETED
@@ -1,99 +0,0 @@
- from typing import overload
- 
- # Converts and outputs various formats of phonemes
- 
- 
- @overload
- def to_sds(ph: str) -> str: ...
- 
- 
- @overload
- def to_sds(ph: list) -> str: ...
- 
- 
- def to_sds(ph: list or str) -> str or None:
-     """
-     Converts phonemes to space delimited string format
- 
-     :param ph: Phoneme as str or list, supports nested lists
-     :return: Phoneme as space delimited string
-     """
-     # Return None if None
-     if ph is None:
-         return None
- 
-     # Return directly if str
-     if isinstance(ph, str):
-         return ph
-     # If is list, convert each element
-     if isinstance(ph, list):
-         # If list empty, return None
-         if len(ph) == 0:
-             return None
-         # Case for further lists
-         if isinstance(ph[0], list):
-             return to_sds(ph[0])  # Recursive call
-         # Case if str at index 0, and size 1, return directly
-         elif isinstance(ph[0], str) and len(ph) == 1:
-             return ph[0]
-         # Case if str at index 0, above size 1, return with join
-         elif isinstance(ph[0], str):
-             return ' '.join(ph)
-         # Case for none
-         elif ph[0] is None:
-             return None
-         else:
-             raise TypeError('to_sds() encountered an unexpected nested element type')
-     # Error if no matches
-     raise TypeError('to_sds() expects a list or string')
- 
- 
- @overload
- def to_list(ph: str) -> list: ...
- 
- 
- @overload
- def to_list(ph: list) -> list: ...
- 
- 
- def to_list(ph: str or list) -> list or None:
-     """
-     Converts phonemes to list format
- 
-     :param ph: Phoneme as str or list, supports nested lists
-     :return: Phoneme as list
-     """
-     # Return None if None
-     if ph is None:
-         return None
- 
-     # Return directly if list and index 0 is str
-     if isinstance(ph, list) and len(ph) > 0 and isinstance(ph[0], str):
-         return ph
- 
-     # If space delimited string, convert to list
-     if isinstance(ph, str):
-         return ph.split(' ')
- 
-     # If nested list, convert each element
-     if isinstance(ph, list):
-         # If list empty or has None, return None
-         if len(ph) == 0 or ph[0] is None:
-             return None
-         # Case for further lists
-         if isinstance(ph[0], list):
-             return to_list(ph[0])  # Recursive call
- 
-     # Error if no matches
-     raise TypeError('to_list() expects a list or string')
- 
- 
- # Surrounds text with curly brackets
- def with_cb(text: str) -> str:
-     """
-     Surrounds text with curly brackets
- 
-     :param text: Text to surround
-     :return: Surrounded text
-     """
-     return '{' + text + '}'
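Note: the converters are shape-tolerant, so callers can pass strings, lists, or nested lists interchangeably:

    from h2p_parser import format_ph as ph

    ph.to_sds(['R', 'EH1', 'D'])    # 'R EH1 D'
    ph.to_sds([['R', 'EH1', 'D']])  # 'R EH1 D' (nested lists unwrap)
    ph.to_list('R EH1 D')           # ['R', 'EH1', 'D']
    ph.with_cb('R EH1 D')           # '{R EH1 D}'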
resources/app/python/xvapitch/text/h2p_parser/h2p.py DELETED
@@ -1,123 +0,0 @@
- import nltk
- import re
- from nltk.tokenize import TweetTokenizer
- from nltk import pos_tag
- from nltk import pos_tag_sents
- from .dictionary import Dictionary
- from .filter import filter_text as ft
- from . import format_ph as ph
- 
- # Check that the nltk data is downloaded, if not, download it
- try:
-     nltk.data.find('taggers/averaged_perceptron_tagger.zip')
- except LookupError:
-     nltk.download('averaged_perceptron_tagger')
- 
- 
- # Method to use Regex to replace the first instance of a word with its phonemes
- def replace_first(target, replacement, text):
-     # Skip if target invalid
-     if target is None or target == '':
-         return text
-     # Replace the first instance of a word with its phonemes
-     return re.sub(r'(?i)\b' + target + r'\b', replacement, text, 1)
- 
- 
- class H2p:
-     def __init__(self, dict_path=None, preload=False, phoneme_format=''):
-         """
-         Creates a H2p parser
- 
-         Supported phoneme formats:
-             - Space delimited
-             - Space delimited surrounded by { }
- 
-         :param dict_path: Path to a heteronym dictionary json file. Built-in dictionary will be used if None
-         :type dict_path: str
-         :param preload: Preloads the tokenizer and tagger during initialization
-         :type preload: bool
-         """
- 
-         # Supported phoneme formats
-         self.phoneme_format = phoneme_format
-         self.dict = Dictionary(dict_path)
-         self.tokenize = TweetTokenizer().tokenize
-         self.get_tags = pos_tag
-         if preload:
-             self.preload()
- 
-     # Method to preload tokenizer and pos_tag
-     def preload(self):
-         tokens = self.tokenize('a')
-         assert tokens == ['a']
-         assert pos_tag(tokens)[0][0] == 'a'
- 
-     # Method to check if a text line contains a heteronym
-     def contains_het(self, text):
-         # Filter the text
-         text = ft(text)
-         # Tokenize
-         words = self.tokenize(text)
-         # Check match with dictionary
-         hets = []
-         for word in words:
-             if self.dict.contains(word):
-                 hets.append(word)
-         return len(hets) > 0, hets
- 
-     # Method to replace heteronyms in a text line to phonemes
-     def replace_het(self, text):
-         # Filter the text
-         working_text = ft(text, preserve_case=True)
-         # Tokenize
-         words = self.tokenize(working_text)
-         # Get pos tags
-         tags = pos_tag(words)
-         # Loop through words and pos tags
-         for word, pos in tags:
-             # Skip if word not in dictionary
-             if not self.dict.contains(word):
-                 continue
-             # Get phonemes
-             phonemes = self.dict.get_phoneme(word, pos)
-             # Format phonemes
-             f_ph = ph.with_cb(ph.to_sds(phonemes))
-             # Replace word with phonemes
-             text = replace_first(word, f_ph, text)
-         return text
- 
-     # Replaces heteronyms in a list of text lines
-     # Slightly faster than replace_het() called on each line
-     def replace_het_list(self, text_list):
-         # Filter the text
-         working_text_list = [ft(text, preserve_case=True) for text in text_list]
-         # Tokenize
-         list_sentence_words = [self.tokenize(text) for text in working_text_list]
-         # Get pos tags list
-         tags_list = pos_tag_sents(list_sentence_words)
-         # Loop through lines
-         for index in range(len(tags_list)):
-             # Loop through words and pos tags in tags_list index
-             for word, pos in tags_list[index]:
-                 # Skip if word not in dictionary
-                 if not self.dict.contains(word):
-                     continue
-                 # Get phonemes
-                 phonemes = self.dict.get_phoneme(word, pos)
-                 # Format phonemes
-                 f_ph = ph.with_cb(ph.to_sds(phonemes))
-                 # Replace word with phonemes
-                 text_list[index] = replace_first(word, f_ph, text_list[index])
-         return text_list
- 
-     # Method to tag a text line, returns a list of tags
-     def tag(self, text):
-         # Filter the text
-         working_text = ft(text, preserve_case=True)
-         # Tokenize
-         words = self.tokenize(working_text)
-         # Get pos tags
-         tags = pos_tag(words)
-         # Only return element 1 of each list
-         return [tag[1] for tag in tags]
- 
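Note: typical use of the removed parser, as a sketch; the exact output depends on the NLTK tagger assigning 'read' a past-tense (VBD) tag here:

    from h2p_parser.h2p import H2p

    h2p = H2p(preload=True)
    h2p.contains_het('I read the book')  # (True, ['read'])
    h2p.replace_het('I read the book')   # 'I {R EH1 D} the book'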
resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/PKG-INFO DELETED
@@ -1,14 +0,0 @@
- Metadata-Version: 2.1
- Name: h2p-parser
- Version: 1.0.0
- Summary: Heteronym to Phoneme Parser
- Home-page: https://github.com/ionite34/h2p-parser
- Author: ionite
- Author-email: dev@ionite.io
- License: Apache 2.0
- Platform: UNKNOWN
- Requires-Python: >=3.7
- License-File: LICENSE
- 
- UNKNOWN
- 
resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/SOURCES.txt DELETED
@@ -1,19 +0,0 @@
- LICENSE
- README.md
- setup.py
- h2p_parser/__init__.py
- h2p_parser/__main__.py
- h2p_parser/cmudictext.py
- h2p_parser/dict_reader.py
- h2p_parser/dictionary.py
- h2p_parser/filter.py
- h2p_parser/format_ph.py
- h2p_parser/h2p.py
- h2p_parser/pos_parser.py
- h2p_parser/processors.py
- h2p_parser/symbols.py
- h2p_parser/h2p_parser.egg-info/PKG-INFO
- h2p_parser/h2p_parser.egg-info/SOURCES.txt
- h2p_parser/h2p_parser.egg-info/dependency_links.txt
- h2p_parser/h2p_parser.egg-info/requires.txt
- h2p_parser/h2p_parser.egg-info/top_level.txt
resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/dependency_links.txt DELETED
@@ -1 +0,0 @@
- 
resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/requires.txt DELETED
@@ -1,2 +0,0 @@
- nltk
- inflect
resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/top_level.txt DELETED
@@ -1 +0,0 @@
- 
resources/app/python/xvapitch/text/h2p_parser/pos_parser.py DELETED
@@ -1,17 +0,0 @@
- # Part of Speech Tag Operations
- 
- # Method to get the parent part of speech (VERB) or (NOUN) from a pos tag
- # from __future__ import annotations
- 
- # def get_parent_pos(pos: str) -> str | None:
- def get_parent_pos(pos):
-     # Get the parent part of speech from a pos tag
-     if pos.startswith('VB'):
-         return 'VERB'
-     elif pos.startswith('NN'):
-         return 'NOUN'
-     elif pos.startswith('RB'):
-         return 'ADVERB'
-     else:
-         return None
- 
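Note: get_parent_pos collapses NLTK's fine-grained Penn tags to their parent class, e.g.:

    get_parent_pos('VBD')  # 'VERB'
    get_parent_pos('NNS')  # 'NOUN'
    get_parent_pos('JJ')   # None (no parent mapping)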
resources/app/python/xvapitch/text/h2p_parser/processors.py DELETED
@@ -1,392 +0,0 @@
- # Transformations of text sequences for matching
- from __future__ import annotations
- from typing import TYPE_CHECKING
- from .symbols import consonants
- 
- import re
- 
- if TYPE_CHECKING:
-     from .cmudictext import CMUDictExt
- 
- _re_digit = re.compile(r'\d+')
- 
- 
- class Processor:
-     def __init__(self, cde: CMUDictExt):
-         self._lookup = cde.lookup
-         self._cmu_get = cde.dict.get
-         self._segment = cde.segment
-         self._tag = cde.h2p.tag
-         self._stem = cde.stem
-         # Number of times respective methods were called
-         self.stat_hits = {
-             'plural': 0,
-             'possessives': 0,
-             'contractions': 0,
-             'hyphenated': 0,
-             'compound': 0,
-             'compound_l2': 0,
-             'stem': 0
-         }
-         # Number of times respective methods returned a value (not None)
-         self.stat_resolves = {
-             'plural': 0,
-             'possessives': 0,
-             'contractions': 0,
-             'hyphenated': 0,
-             'compound': 0,
-             'compound_l2': 0,
-             'stem': 0
-         }
-         # Holds events when features encountered unexpected language syntax
-         self.stat_unexpected = {
-             'plural': [],
-             'possessives': [],
-             'contractions': [],
-             'hyphenated': [],
-             'compound': [],
-             'compound_l2': [],
-             'stem': []
-         }
- 
-     def auto_possessives(self, word: str) -> str | None:
-         """
-         Auto-possessives
-         :param word: Input of possible possessive word
-         :return: Phoneme of word as SDS, or None if unresolvable
-         """
-         if not word.endswith("'s"):
-             return None
-         # If the word ends with "'s", register a hit
-         self.stat_hits['possessives'] += 1
-         """
-         There are 3 general cases:
-         1. Base words ending in one of 6 special consonants (sibilants)
-             - i.e. Tess's, Rose's, Butch's, Midge's, Rush's, Garage's
-             - With consonants ending of [s], [z], [ch], [j], [sh], [zh]
-             - In ARPAbet: {S}, {Z}, {CH}, {JH}, {SH}, {ZH}
-             - These require a suffix of {IH0 Z}
-         2. Base words ending in vowels and voiced consonants:
-             - i.e. Fay's, Hugh's, Bob's, Ted's, Meg's, Sam's, Dean's, Claire's, Paul's, Bing's
-             - In ARPAbet: {IY0}, {EY1}, {UW1}, {B}, {D}, {G}, {M}, {N}, {R}, {L}, {NG}
-             - Vowels need a wildcard match of any numbered variant
-             - These require a suffix of {Z}
-         3. Base words ending in voiceless consonants:
-             - i.e. Hope's, Pat's, Clark's, Ruth's
-             - In ARPAbet: {P}, {T}, {K}, {TH}
-             - These require a suffix of {S}
-         """
- 
-         # Method to return phoneme and increment stat
-         def _resolve(phoneme: list) -> list:
-             self.stat_resolves['possessives'] += 1
-             return phoneme
- 
-         core = word[:-2]  # Get core word without possessive
-         ph = self._lookup(core, ph_format='list')  # find core word using recursive search
-         if ph is None:
-             return None  # Core word not found
-         # [Case 1]
-         if ph[-1] in {'S', 'Z', 'CH', 'JH', 'SH', 'ZH'}:
-             ph += ['IH0', 'Z']
-             return _resolve(ph)
-         # [Case 2]
-         """
-         Valid for case 2:
-         'AA', 'AO', 'EY', 'OW', 'UW', 'AE', 'AW', 'EH', 'IH',
-         'OY', 'AH', 'AY', 'ER', 'IY', 'UH', 'UH',
-         'B', 'D', 'G', 'M', 'N', 'R', 'L', 'NG'
-         To simplify matching, we will check for the listed single-letter variants and 'NG'
-         and then check for any numbered variant
-         """
-         if ph[-1] in {'B', 'D', 'G', 'M', 'N', 'R', 'L', 'NG'} or ph[-1][-1].isdigit():
-             ph += ['Z']
-             return _resolve(ph)
-         # [Case 3]
-         if ph[-1] in {'P', 'T', 'K', 'TH'}:
-             ph += ['S']
-             return _resolve(ph)
- 
-         return None  # No match found
- 
-     def auto_contractions(self, word: str) -> str | None:
-         """
-         Auto contracts form and finds phonemes
-         :param word:
-         :return:
-         """
-         """
-         Supported contractions:
-         - 'll
-         - 'd
-         """
-         # First, check if the word is a contraction
-         parts = word.split("'")  # Split on [']
-         if len(parts) == 1 or parts[1] not in {'ll', 'd'}:
-             return None  # No contraction found
-         if len(parts) > 2:
-             self.stat_unexpected['contractions'].append(word)
-             return None  # More than 2 parts, can't be a contraction
-         # If initial check passes, register a hit
-         self.stat_hits['contractions'] += 1
- 
-         # Get the core word
-         core = parts[0]
-         # Get the phoneme for the core word recursively
-         ph = self._lookup(core, ph_format='list')
-         if ph is None:
-             return None  # Core word not found
-         # Add the phoneme with the appropriate suffix
-         if parts[1] == 'll':
-             ph += ['L']
-         elif parts[1] == 'd':
-             ph += ['D']
-         # Return the phoneme
-         self.stat_resolves['contractions'] += 1
-         return ph
- 
-     def auto_hyphenated(self, word: str) -> str | None:
-         """
-         Splits hyphenated words and attempts to resolve components
-         :param word:
-         :return:
-         """
-         # First, check if the word is a hyphenated word
-         if '-' not in word:
-             return None  # No hyphen found
-         # If initial check passes, register a hit
-         self.stat_hits['hyphenated'] += 1
-         # Split the word into parts
-         parts = word.split('-')
-         # Get the phonemes for each part
-         ph = []
-         for part in parts:
-             ph_part = self._lookup(part, ph_format='sds')
-             if ph_part is None:
-                 return None  # Part not found
-             ph.append(ph_part)
-         # Join the phonemes
-         ph = ' '.join(ph)
-         # Return the phoneme
-         self.stat_resolves['hyphenated'] += 1
-         return ph
- 
-     def auto_compound(self, word: str) -> str | None:
-         """
-         Splits compound words and attempts to resolve components
-         :param word:
-         :return:
-         """
-         # Split word into parts
-         parts = self._segment(word)
-         if len(parts) == 1:
-             return None  # No compound found
-         # If initial check passes, register a hit
-         self.stat_hits['compound'] += 1
-         # Get the phonemes for each part
-         ph = []
-         for part in parts:
-             ph_part = self._lookup(part, ph_format='sds')
-             if ph_part is None:
-                 return None  # Part not found
-             ph.append(ph_part)
-         # Join the phonemes
-         ph = ' '.join(ph)
-         # Return the phoneme
-         self.stat_resolves['compound'] += 1
-         return ph
- 
-     def auto_plural(self, word: str, pos: str = None) -> str | None:
-         """
-         Finds singular form of plurals and attempts to resolve separately
-         Optionally a pos tag can be provided.
-         If no tags are provided, there will be a single word pos inference,
-         which is not ideal.
-         :param pos:
-         :param word:
-         :return:
-         """
-         # First, check if the word is a replaceable plural
-         # Needs to end in 's' or 'es'
-         if word[-1] != 's':
-             return None  # No plural found
-         # Now check if the word is a plural using pos
-         if pos is None:
-             pos = self._tag(word)
-         if pos is None or len(pos) == 0 or (pos[0] != 'NNS' and pos[0] != 'NNPS'):
-             return None  # No tag found
-         # If initial check passes, register a hit
-         self.stat_hits['plural'] += 1
- 
-         """
-         Case 1:
-         > Word ends in 'oes'
-         > Remove the 'es' to get the singular
-         """
-         if len(word) > 3 and word[-3:] == 'oes':
-             singular = word[:-2]
-             # Look up the possessive form (since the pronunciation is the same)
-             ph = self.auto_possessives(singular + "'s")
-             if ph is not None:
-                 self.stat_resolves['plural'] += 1
-                 return ph  # Return the phoneme
- 
-         """
-         Case 2:
-         > Word ends in 's'
-         > Remove the 's' to get the singular
-         """
-         if len(word) > 1 and word[-1] == 's':
-             singular = word[:-1]
-             # Look up the possessive form (since the pronunciation is the same)
-             ph = self.auto_possessives(singular + "'s")
-             if ph is not None:
-                 self.stat_resolves['plural'] += 1
-                 return ph  # Return the phoneme
- 
-         # If no matches, return None
-         return None
- 
-     def auto_stem(self, word: str) -> str | None:
-         """
-         Attempts to resolve using the root stem of a word.
-         Supported modes:
-             - "ing"
-             - "ingly"
-             - "ly"
-         :param word:
-         :return:
-         """
- 
-         # noinspection SpellCheckingInspection
-         """
-         'ly' has no special rules, always add phoneme 'L IY0'
- 
-         'ing' relevant rules:
- 
-         > If the original verb ended in [e], remove it and add [ing]
-             - i.e. take -> taking, make -> making
-             - We will search once with the original verb, and once with [e] added
-             - 1st attempt: tak, mak
-             - 2nd attempt: take, make
- 
-         > If the input word has a repeated consonant before [ing], it's likely that
-         the original verb has only 1 of the consonants
-             - i.e. running -> run, stopping -> stop
-             - We will search for repeated consonants, and perform 2 attempts:
-             - 1st attempt: without the repeated consonant (run, stop)
-             - 2nd attempt: with the repeated consonant (runn, stopp)
-         """
-         # Discontinue if word is too short
-         if len(word) < 3 or (not word.endswith('ly') and not word.endswith('ing')):
-             return None
-         self.stat_hits['stem'] += 1  # Register hit
- 
-         # For ly case
-         if word.endswith('ly'):
-             # Get the root word
-             root = word[:-2]
-             # Recursively get the root
-             ph_root = self._lookup(root, ph_format='sds')
-             # If not exist, return None
-             if ph_root is None:
-                 return None
-             ph_ly = 'L IY0'
-             ph_joined = ' '.join([ph_root, ph_ly])
-             self.stat_resolves['stem'] += 1
-             return ph_joined
- 
-         # For ing case
-         if word.endswith('ing'):
-             # Get the root word
-             root = word[:-3]
-             # Recursively get the root
-             ph_root = self._lookup(root, ph_format='sds')
-             # If not exist, return None
-             if ph_root is None:
-                 return None
-             ph_ing = 'IH0 NG'
-             ph_joined = ' '.join([ph_root, ph_ing])
-             self.stat_resolves['stem'] += 1
-             return ph_joined
- 
-     def auto_component(self, word: str) -> str | None:
-         """
-         Searches for target word as component of a larger word
-         :param word:
-         :return:
-         """
- 
-         """
-         This processing step checks for words as a component of a larger word
-         - i.e. 'synth' is not in the cmu dictionary
-         - Stage 1: We will search for any word beginning with 'synth' (10 matches)
-             - This is because most unseen short words are likely shortened versions
-             - We will split
-         - Stage 2: Search for any word containing 'synth' (13 matches)
-         """
-         raise NotImplementedError
- 
-     def auto_compound_l2(self, word: str, recursive: bool = True) -> str | None:
-         """
-         Searches for target word as a compound word.
-         > Does not use n-gram splitting like auto_compound()
-         > Splits words manually into every possible combination
-         > Returns the match with the highest length of both words
-         :param recursive: True to enable recursive lookups, otherwise only use base CMU dictionary
-         :param word:
-         :return:
-         """
-         # Word must be fully alphabetic
-         if not word.isalpha() or len(word) < 3:
-             return None
-         self.stat_hits['compound_l2'] += 1  # Register hit
- 
-         # Define lookup mode
-         def _lu(search_word: str) -> str | None:
-             if recursive:
-                 return self._lookup(search_word, ph_format='sds')
-             else:
-                 return self._cmu_get(search_word)
- 
-         # Check if the last part is a single character
-         # And that it is repeated in the last char of the first part
-         # This is likely silent so remove it
-         # i.e. 'Derakk' -> 'Derak'
-         # If the word contains a repeated consonant at the end, remove it
-         # First check repeated last 2 letters
-         if word[-2:][0] == word[-2:][1]:
-             # Remove the last char from the word
-             word = word[:-1]
- 
-         # Holds all matches as tuples
-         # (len1, len2, p1, p2, ph1, ph2)
-         matches = []
- 
-         # Splits the word into every possible combination
-         for i in range(1, len(word)):
-             p1 = word[:i]
-             p2 = word[i:]
-             # Looks up both words
-             ph1 = _lu(p1)
-             if ph1 is None:
-                 continue  # Skip if not found
-             ph2 = _lu(p2)
-             if ph2 is None:
-                 continue  # Skip if not found
-             # If both words exist, add to list as tuple
-             matches.append((len(p1), len(p2), p1, p2, ph1, ph2))
- 
-         # Pick the match with the highest length of both words
-         if len(matches) == 0:
-             return None
-         else:
-             # Sort by the minimum of len1 and len2
-             matches.sort(key=lambda x: min(x[0], x[1]))
-             # Get the highest minimum length match
-             match = matches[-1]
-             # Return the full joined match
-             self.stat_resolves['compound_l2'] += 1  # Register resolve
-             return match[4] + ' ' + match[5]
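Note: the possessive logic above reduces to a suffix choice keyed on the final ARPAbet phoneme of the base word. A condensed restatement, not part of the removed file:

    def possessive_suffix(last_ph):
        # Sibilants take 'IH0 Z' (Rose's); vowels (any stress digit) and
        # voiced consonants take 'Z' (Bob's); voiceless stops and 'TH'
        # take 'S' (Pat's).
        if last_ph in {'S', 'Z', 'CH', 'JH', 'SH', 'ZH'}:
            return ['IH0', 'Z']
        if last_ph in {'B', 'D', 'G', 'M', 'N', 'R', 'L', 'NG'} or last_ph[-1].isdigit():
            return ['Z']
        if last_ph in {'P', 'T', 'K', 'TH'}:
            return ['S']
        return None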
resources/app/python/xvapitch/text/h2p_parser/symbols.py DELETED
@@ -1,82 +0,0 @@
- # Holds symbols for graphemes, phonemes, and pos-tags.
- # noinspection SpellCheckingInspection,GrazieInspection
- """
- POS tag list:
- 
- CC      coordinating conjunction
- CD      cardinal digit
- DT      determiner
- EX      existential there ("there is" -> "there exists")
- FW      foreign word
- IN      preposition/subordinating conjunction
- JJ      adjective ('big')
- JJR     adjective, comparative ('bigger')
- JJS     adjective, superlative ('biggest')
- LS      list marker ("1)", "2)", "3)")
- MD      modal ('could', 'will')
- NN      noun, singular
- NNS     noun, plural
- NNP     proper noun, singular ('Harrison')
- NNPS    proper noun, plural ('Americans')
- PDT     predeterminer ('all' in 'all the kids')
- POS     possessive ending (parent's)
- PRP     personal pronoun (I, he, she)
- PRP$    possessive pronoun (my, his, hers)
- RB      adverb ('very', 'silently')
- RBR     adverb, comparative ('better')
- RBS     adverb, superlative ('best')
- RP      particle ('give up')
- TO      to ("go 'to' the store.")
- UH      interjection ("errrrrrrrm")
- VB      verb, base form ('take')
- VBD     verb, past tense ('took')
- VBG     verb, gerund/present participle ('taking')
- VBN     verb, past participle ('taken')
- VBP     verb, sing. present, non-3d ('take')
- VBZ     verb, 3rd person sing. present ('takes')
- WDT     wh-determiner ('which')
- WP      wh-pronoun ('who', 'what')
- WP$     possessive wh-pronoun ('whose')
- WRB     wh-adverb ('where', 'when')
- """
- 
- from __future__ import annotations
- 
- # noinspection SpellCheckingInspection,GrazieInspection
- graphemes = list("abcdefghijklmnopqrstuvwxyz")
- phonemes = ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
-             'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH',
-             'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH',
-             'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
-             'OW0', 'OW1', 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH',
-             'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH']
- pos_tags = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS',
-             'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH',
-             'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']
- pos_type_tags = ['VERB', 'NOUN', 'PRON', 'ADJ', 'ADV']
- pos_type_short_tags = ['V', 'N', 'P', 'A', 'R']
- pos_type_form_dict = {'V': 'VERB', 'N': 'NOUN', 'P': 'PRON', 'A': 'ADJ', 'R': 'ADV'}
- graphemes_set = set(graphemes)
- phonemes_set = set(phonemes)
- pos_tags_set = set(pos_tags)
- pos_type_tags_set = set(pos_type_tags)
- pos_type_short_tags_set = set(pos_type_short_tags)
- punctuation = {'.', ',', ':', ';', '?', '!', '-', '_', '\'', '\"', '`', '~', '@', '#', '$'}
- consonants = {'B', 'CH', 'D', 'DH', 'F', 'G', 'HH', 'JH', 'K', 'L', 'M', 'N', 'NG', 'P', 'R',
-               'S', 'SH', 'T', 'TH', 'V', 'W', 'Y', 'Z', 'ZH'}
- 
- 
- # Method to convert from short type tags to full type tags.
- def to_full_type_tag(short_type_tag: str) -> str | None:
-     # Equivalent to a lookup in pos_type_form_dict; returns None if unknown
-     return pos_type_form_dict.get(short_type_tag)
resources/app/python/xvapitch/text/h2p_parser/text/__init__.py DELETED
File without changes
resources/app/python/xvapitch/text/h2p_parser/text/numbers.py DELETED
@@ -1,166 +0,0 @@
- # Provides parsing of numbers to text
- """
- This module provides parsing of numeric types in English to text.
- Modified from https://github.com/keithito/tacotron
- """
- 
- import inflect
- import re
- 
- _magnitudes = ['trillion', 'billion', 'million', 'thousand', 'hundred', 'm', 'b', 't']
- _magnitudes_key = {'m': 'million', 'b': 'billion', 't': 'trillion'}
- _measurements = '(f|c|k|d|m|km|ft)'
- _measurements_key = {'f': 'fahrenheit',
-                      'c': 'celsius',
-                      'k': 'thousand',
-                      'm': 'meters',
-                      'km': 'kilometers',
-                      'ft': 'feet'}
- _currency_key = {'$': 'dollar', '£': 'pound', '€': 'euro', '₩': 'won'}
- _inflect = inflect.engine()
- _comma_number_re = re.compile(r'([0-9][0-9,]+[0-9])')
- _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
- _currency_re = re.compile(r'([$€£₩])([0-9.,]*[0-9]+)(?:[ ]?({})(?=[^a-zA-Z]|$))?'.format("|".join(_magnitudes)),
-                           re.IGNORECASE)
- _measurement_re = re.compile(r'([0-9.,]*[0-9]+(\s)?{}\b)'.format(_measurements), re.IGNORECASE)
- _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
- _range_re = re.compile(r'(?<=[0-9])+(-)(?=[0-9])+.*?')
- _roman_re = re.compile(r'\b(?=[MDCLXVI]+\b)M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{2,3})\b')  # avoid I
- _multiply_re = re.compile(r'(\b[0-9]+)(x)([0-9]+)')
- _number_re = re.compile(r"[0-9]+'s|[0-9]+s|[0-9]+")
- 
- 
- def _remove_commas(m):
-     return m.group(1).replace(',', '')
- 
- 
- def _expand_decimal_point(m):
-     return m.group(1).replace('.', ' point ')
- 
- 
- def _expand_currency(m):
-     currency = _currency_key[m.group(1)]
-     quantity = m.group(2)
-     magnitude = m.group(3)
- 
-     # remove commas from quantity to be able to convert to numerical
-     quantity = quantity.replace(',', '')
- 
-     # check for million, billion, etc...
-     if magnitude is not None and magnitude.lower() in _magnitudes:
-         if len(magnitude) == 1:
-             magnitude = _magnitudes_key[magnitude.lower()]
-         return "{} {} {}".format(_expand_hundreds(quantity), magnitude, currency + 's')
- 
-     parts = quantity.split('.')
-     if len(parts) > 2:
-         return quantity + " " + currency + "s"  # Unexpected format
- 
-     dollars = int(parts[0]) if parts[0] else 0
- 
-     cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
-     if dollars and cents:
-         dollar_unit = currency if dollars == 1 else currency + 's'
-         cent_unit = 'cent' if cents == 1 else 'cents'
-         return "{} {}, {} {}".format(
-             _expand_hundreds(dollars), dollar_unit,
-             _inflect.number_to_words(cents), cent_unit)
-     elif dollars:
-         dollar_unit = currency if dollars == 1 else currency + 's'
-         return "{} {}".format(_expand_hundreds(dollars), dollar_unit)
-     elif cents:
-         cent_unit = 'cent' if cents == 1 else 'cents'
-         return "{} {}".format(_inflect.number_to_words(cents), cent_unit)
-     else:
-         return 'zero' + ' ' + currency + 's'
- 
- 
- def _expand_hundreds(text):
-     number = float(text)
-     if 1000 < number < 10000 and (number % 100 == 0) and (number % 1000 != 0):
-         return _inflect.number_to_words(int(number / 100)) + " hundred"
-     else:
-         return _inflect.number_to_words(text)
- 
- 
- def _expand_ordinal(m):
-     return _inflect.number_to_words(m.group(0))
- 
- 
- def _expand_measurement(m):
-     _, number, measurement = re.split(r'(\d+(?:\.\d+)?)', m.group(0))
-     number = _inflect.number_to_words(number)
-     measurement = "".join(measurement.split())
-     measurement = _measurements_key[measurement.lower()]
-     # if measurement is plural, and number is singular, remove the 's'
-     if number == "one" and str.endswith(measurement, "s"):
-         # Remove the 's' from the end of the measurement
-         measurement = measurement[:-1]
-     return "{} {}".format(number, measurement)
- 
- 
- def _expand_range(m):
-     return ' to '
- 
- 
- def _expand_multiply(m):
-     left = m.group(1)
-     right = m.group(3)
-     return "{} by {}".format(left, right)
- 
- 
- def _expand_roman(m):
-     # from https://stackoverflow.com/questions/19308177/converting-roman-numerals-to-integers-in-python
-     roman_numerals = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
-     result = 0
-     num = m.group(0)
-     for i, c in enumerate(num):
-         if (i + 1) == len(num) or roman_numerals[c] >= roman_numerals[num[i + 1]]:
-             result += roman_numerals[c]
-         else:
-             result -= roman_numerals[c]
-     return str(result)
- 
- 
- def _expand_number(m):
-     _, number, suffix = re.split(r"(\d+(?:'?\d+)?)", m.group(0))
-     number = int(number)
-     if 1000 < number < 10000 and (number % 100 == 0) and (number % 1000 != 0):
-         text = _inflect.number_to_words(number // 100) + " hundred"
-     elif 1000 < number < 3000:
-         if number == 2000:
-             text = 'two thousand'
-         elif 2000 < number < 2010:
-             text = 'two thousand ' + _inflect.number_to_words(number % 100)
-         elif number % 100 == 0:
-             text = _inflect.number_to_words(number // 100) + ' hundred'
-         else:
-             number = _inflect.number_to_words(number, andword='', zero='oh', group=2).replace(', ', ' ')
-             number = re.sub(r'-', ' ', number)
-             text = number
-     else:
-         number = _inflect.number_to_words(number, andword='and')
-         number = re.sub(r'-', ' ', number)
-         number = re.sub(r',', '', number)
-         text = number
- 
-     if suffix in ("'s", "s"):
-         if text[-1] == 'y':
-             text = text[:-1] + 'ies'
-         else:
-             text = text + suffix
- 
-     return text
- 
- 
- def normalize_numbers(text):
-     text = re.sub(_comma_number_re, _remove_commas, text)
-     text = re.sub(_currency_re, _expand_currency, text)
-     text = re.sub(_decimal_number_re, _expand_decimal_point, text)
-     text = re.sub(_ordinal_re, _expand_ordinal, text)
-     # text = re.sub(_range_re, _expand_range, text)
-     text = re.sub(_measurement_re, _expand_measurement, text)
-     text = re.sub(_roman_re, _expand_roman, text)
-     text = re.sub(_multiply_re, _expand_multiply, text)
-     text = re.sub(_number_re, _expand_number, text)
-     return text
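Note: expected behaviour per the substitution order in normalize_numbers (a sketch; exact wording comes from the inflect engine):

    from h2p_parser.text.numbers import normalize_numbers

    normalize_numbers('$3.50')         # 'three dollars, fifty cents'
    normalize_numbers('10 km')         # 'ten kilometers'
    normalize_numbers('the 3rd time')  # 'the third time'
    normalize_numbers('Chapter XIV')   # 'Chapter fourteen' (roman -> digits -> words)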
resources/app/python/xvapitch/text/h2p_parser/utils/__init__.py DELETED
File without changes
resources/app/python/xvapitch/text/h2p_parser/utils/converter.py DELETED
@@ -1,79 +0,0 @@
- # Converts dictionary files
- import json
- 
- from .. import symbols
- from .. import format_ph as ph
- from tqdm import tqdm
- 
- 
- def from_binary_delim(path, delimiter) -> dict:
-     # Converts a delimited binary state heteronym look-up dictionary to a dict format
-     # Expected format: WORD|(Space Separated Phonemes Case)|(Space Separated Phonemes Default)|(Case)
-     # Example: "REJECT|R IH0 JH EH1 K T|R IY1 JH EH0 K T|V"
-     # Hashtag comments are allowed but only at the start of a file
- 
-     # Import file
-     result_dict = {}
-     with open(path, 'r') as f:
-         num_lines = sum(1 for _ in f)
-     with open(path, 'r') as f:
-         skipped_comments = False
-         for line in tqdm(f, total=num_lines):
-             # Skip comments
-             if not skipped_comments:
-                 if line.startswith('#') or line == '\n':
-                     continue
-                 else:
-                     skipped_comments = True
-             # Skip empty or newline lines
-             if line.strip() == '' or line.strip() == '\n':
-                 continue
-             # Parse line using passed delimiter
-             tokens = line.strip().split(delimiter)
-             # Check for correct number of tokens
-             if len(tokens) != 4:
-                 raise ValueError('Invalid number of tokens in line: ' + line)
-             # Get word (token 0) and check validity (no spaces)
-             word = tokens[0].lower()
-             if ' ' in word:
-                 raise ValueError('Invalid word in line: ' + line)
-             # Get phonemes and check validity (alphanumeric)
-             ph_case = tokens[1]
-             ph_default = tokens[2]
-             if not ph_case.replace(' ', '').isalnum() or not ph_default.replace(' ', '').isalnum():
-                 raise ValueError('Invalid phonemes in line: ' + line)
-             # Get case (token 3) and check validity (alphanumeric)
-             case = tokens[3]
-             if not case.isalnum():
-                 raise ValueError('Invalid case in line: ' + line)
-             # Check if case is a full case or full type case
-             if case in symbols.pos_tags_set or case in symbols.pos_type_tags_set:
-                 # Add to dictionary directly
-                 # Build sub-dictionary for each case
-                 sub_dict = result_dict.get(word, {})
-                 sub_dict[case] = ph.to_sds(ph_case)
-                 sub_dict['DEFAULT'] = ph.to_sds(ph_default)
-                 result_dict[word] = sub_dict
-             # Check if case is a short type case
-             elif case in symbols.pos_type_short_tags_set:
-                 # Need to convert to full type case
-                 sub_dict = result_dict.get(word, {})
-                 case_full = symbols.pos_type_form_dict[case]
-                 sub_dict[case_full] = ph.to_sds(ph_case)
-                 sub_dict['DEFAULT'] = ph.to_sds(ph_default)
-                 result_dict[word] = sub_dict
-             else:
-                 raise ValueError('Invalid case in line: ' + line)
-     return result_dict
- 
- 
- # Method to write a dict to a json file
- def to_json(path, dict_to_write):
-     # Writes a dictionary to a json file
-     with open(path, 'w') as f:
-         json.dump(dict_to_write, f, indent=4, sort_keys=True)
- 
- 
- # Combined method to convert binary delimited files to json
- def bin_delim_to_json(path, output_path, delimiter):
-     to_json(output_path, from_binary_delim(path, delimiter))
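Note: end to end, the converter turns the pipe-delimited format from the docstring into the JSON layout consumed by dictionary.py. File names below are hypothetical:

    from h2p_parser.utils.converter import bin_delim_to_json

    # heteronyms.txt contains: REJECT|R IH0 JH EH1 K T|R IY1 JH EH0 K T|V
    bin_delim_to_json('heteronyms.txt', 'dict.json', '|')
    # dict.json gains:
    # {"reject": {"DEFAULT": "R IY1 JH EH0 K T", "VERB": "R IH0 JH EH1 K T"}}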
resources/app/python/xvapitch/text/h2p_parser/utils/parser.py DELETED
@@ -1,133 +0,0 @@
- # Parses annotation files for conversion of sentences to phonemes
- from __future__ import annotations
- from h2p_parser import cmudictext
- from h2p_parser.filter import filter_text
- from h2p_parser.text.numbers import normalize_numbers
- from h2p_parser.symbols import punctuation
- 
- from tqdm import tqdm
- 
- 
- # Reads a file into a list of lines
- def read_file(file_name, delimiter) -> list:
-     with open(file_name, 'r', encoding="utf-8") as f:
-         result = []
-         for line in f:
-             line = line.split(delimiter)
-             # Take the second element
-             result.append(line[1].lower())
-         return result
- 
- 
- # Method that checks if a single line is resolvable
- 
- 
- # Checks a list of lines for unresolvable words
- # Returns a list of lines with unresolvable words, or None if no unresolvable words
- def check_lines(lines: list) -> ParseResult:
-     cde = cmudictext.CMUDictExt()
-     # Holds result
-     result = ParseResult()
-     # Loop with tqdm
-     for line in tqdm(lines, desc='Checking lines'):
-         # Add
-         result.all_lines.append(line)
-         result.lines.add(line)
-         # If line contains het, add to result
-         if cde.h2p.contains_het(line)[0]:
-             result.all_lines_cont_het.append(line)
-         # Filter the line
-         f_line = filter_text(line)
-         # Number converter
-         f_line = normalize_numbers(f_line)
-         # Tokenize
-         tokens = cde.h2p.tokenize(f_line)
-         for word in tokens:
-             # Skip word if punctuation
-             if word in punctuation:
-                 continue
-             # Add word to result
-             result.all_words.append(word)
-             result.words.add(word)
-             # Check if word is resolvable
-             h2p_res = cde.h2p.contains_het(word)[0]
-             cmu_res = cde.dict.get(word) is not None
-             fet_res = cde.lookup(word) is not None
-             if not h2p_res and not cmu_res and not fet_res:
-                 # If word ends in "'s", remove it and add the base word
-                 if word.endswith("'s"):
-                     word = word[:-2]
-                 result.unres_all_lines.append(line)
-                 result.unres_all_words.append(word)
-                 result.unres_lines.add(line)
-                 result.unres_words.add(word)
-             elif h2p_res:
-                 result.n_words_res += 1
-                 result.n_words_het += 1
-             elif cmu_res:
-                 result.n_words_res += 1
-                 result.n_words_cmu += 1
-             elif fet_res:
-                 result.n_words_res += 1
-                 result.n_words_fet += 1
- 
-     # Also pass stats
-     result.ft_stats = cde.p.stat_resolves
- 
-     return result
- 
- 
- # Class to hold the result of a parse
- class ParseResult:
-     def __init__(self):
-         self.all_lines = []
-         self.all_lines_cont_het = []
-         self.unres_all_lines = []
-         self.lines = set()
-         self.unres_lines = set()
-         # Words
-         self.all_words = []
-         self.unres_all_words = []
-         self.words = set()
-         self.unres_words = set()
-         # Numerical stats
-         self.n_words_res = 0  # Number of total resolved words
-         self.n_words_cmu = 0  # Resolved words from CMU
-         self.n_words_fet = 0  # Resolved words from Features
-         self.n_words_het = 0  # Resolved words from H2p
-         # Stats from cmudictext
-         self.ft_stats = None
- 
-     # Get percentage of lines covered
-     def line_unique_coverage(self) -> float:
-         dec = 1 - len(self.unres_lines) / len(self.lines)
-         return round(dec * 100, 2)
- 
-     # Get percentage of words covered
-     def word_unique_coverage(self) -> float:
-         dec = 1 - len(self.unres_words) / len(self.words)
-         return round(dec * 100, 2)
- 
-     # Get percentage of lines covered (All)
-     def line_coverage(self) -> float:
-         dec = 1 - len(self.unres_all_lines) / len(self.all_lines)
-         return round(dec * 100, 2)
- 
-     # Get percentage of words covered (All)
-     def word_coverage(self) -> float:
-         dec = 1 - len(self.unres_all_words) / len(self.all_words)
-         return round(dec * 100, 2)
- 
-     # Get percentage of heteronyms containing lines
-     def percent_line_het(self) -> float:
-         dec = len(self.all_lines_cont_het) / len(self.all_lines)
-         return round(dec * 100, 2)
- 
-     # Get percentage of words resolved by H2p
-     def percent_word_h2p(self) -> float:
-         dec = self.n_words_het / self.n_words_res
-         return round(dec * 100, 2)
- 
-     # Get percentage of words resolved by CMU
-     def percent_word_cmu(self) -> float:
-         dec = self.n_words_cmu / self.n_words_res
-         return round(dec * 100, 2)
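Note: a typical run over an LJSpeech-style metadata file (hypothetical path; read_file takes the second delimited column as the transcript):

    from h2p_parser.utils.parser import read_file, check_lines

    lines = read_file('metadata.csv', '|')
    result = check_lines(lines)
    print(result.word_coverage(), '% of all words resolvable')
    print(result.percent_word_h2p(), '% of resolved words were heteronyms')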