jamesebond committed on
Commit
c4039ba
1 Parent(s): 38eec76

Upload 5 files

Files changed (5)
  1. model.py +491 -0
  2. pinyin.txt +408 -0
  3. support_language.json +210 -0
  4. tokenizer.json +2 -2
  5. tokenizer_config.json +1 -1
model.py ADDED
@@ -0,0 +1,491 @@
1
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
2
+ import torch
3
+ from modules.file import ExcelFileWriter
4
+ import os
5
+
6
+ from abc import ABC, abstractmethod
7
+ from typing import List
8
+ import re
9
+
10
+ class FilterPipeline():
11
+ def __init__(self, filter_list):
12
+ self._filter_list:List[Filter] = filter_list
13
+
14
+ def append(self, filter):
15
+ self._filter_list.append(filter)
16
+
17
+ def batch_encoder(self, inputs):
18
+ for filter in self._filter_list:
19
+ inputs = filter.encoder(inputs)
20
+ return inputs
21
+
22
+ def batch_decoder(self, inputs):
23
+ for filter in reversed(self._filter_list):
24
+ inputs = filter.decoder(inputs)
25
+ return inputs
26
+
27
+ class Filter(ABC):
28
+ def __init__(self):
29
+ self.name = 'filter'
30
+ self.code = []
31
+ @abstractmethod
32
+ def encoder(self, inputs):
33
+ pass
34
+
35
+ @abstractmethod
36
+ def decoder(self, inputs):
37
+ pass
38
+
39
+ class SpecialTokenFilter(Filter):
40
+ def __init__(self):
41
+ self.name = 'special token filter'
42
+ self.code = []
43
+ self.special_tokens = ['!', '!', '-']
44
+
45
+ def encoder(self, inputs):
46
+ filtered_inputs = []
47
+ self.code = []
48
+ for i, input_str in enumerate(inputs):
49
+ if not all(char in self.special_tokens for char in input_str):
50
+ filtered_inputs.append(input_str)
51
+ else:
52
+ self.code.append([i, input_str])
53
+ return filtered_inputs
54
+
55
+ def decoder(self, inputs):
56
+ original_inputs = inputs.copy()
57
+ for removed_indice in self.code:
58
+ original_inputs.insert(removed_indice[0], removed_indice[1])
59
+ return original_inputs
60
+
61
+ class SperSignFilter(Filter):
62
+ def __init__(self):
63
+ self.name = 's percentage sign filter'
64
+ self.code = []
65
+
66
+ def encoder(self, inputs):
67
+ encoded_inputs = []
68
+ self.code = [] # clear self.code
69
+ for i, input_str in enumerate(inputs):
70
+ if '%s' in input_str:
71
+ encoded_str = input_str.replace('%s', '*')
72
+ self.code.append(i) # record the index of strings containing '%s'
73
+ else:
74
+ encoded_str = input_str
75
+ encoded_inputs.append(encoded_str)
76
+ return encoded_inputs
77
+
78
+ def decoder(self, inputs):
79
+ decoded_inputs = inputs.copy()
80
+ for i in self.code:
81
+ decoded_inputs[i] = decoded_inputs[i].replace('*', '%s') # restore the original string at each recorded index
82
+ return decoded_inputs
83
+
84
+ class ParenSParenFilter(Filter):
85
+ def __init__(self):
86
+ self.name = 'Paren s paren filter'
87
+ self.code = []
88
+
89
+ def encoder(self, inputs):
90
+ encoded_inputs = []
91
+ self.code = [] # clear self.code
92
+ for i, input_str in enumerate(inputs):
93
+ if '(s)' in input_str:
94
+ encoded_str = input_str.replace('(s)', '$')
95
+ self.code.append(i) # record the index of strings containing '(s)'
96
+ else:
97
+ encoded_str = input_str
98
+ encoded_inputs.append(encoded_str)
99
+ return encoded_inputs
100
+
101
+ def decoder(self, inputs):
102
+ decoded_inputs = inputs.copy()
103
+ for i in self.code:
104
+ decoded_inputs[i] = decoded_inputs[i].replace('$', '(s)') # restore the original string at each recorded index
105
+ return decoded_inputs
106
+
107
+ class ChevronsFilter(Filter):
108
+ def __init__(self):
109
+ self.name = 'chevrons filter'
110
+ self.code = []
111
+
112
+ def encoder(self, inputs):
113
+ encoded_inputs = []
114
+ self.code = [] # clear self.code
115
+ pattern = re.compile(r'<.*?>')
116
+ for i, input_str in enumerate(inputs):
117
+ if pattern.search(input_str):
118
+ matches = pattern.findall(input_str)
119
+ encoded_str = pattern.sub('#', input_str)
120
+ self.code.append((i, matches)) # record the index and the matched tags for this string
121
+ else:
122
+ encoded_str = input_str
123
+ encoded_inputs.append(encoded_str)
124
+ return encoded_inputs
125
+
126
+ def decoder(self, inputs):
127
+ decoded_inputs = inputs.copy()
128
+ for i, matches in self.code:
129
+ for match in matches:
130
+ decoded_inputs[i] = decoded_inputs[i].replace('#', match, 1) # put the stored matches back one by one
131
+ return decoded_inputs
132
+
133
+ class SimilarFilter(Filter):
134
+ def __init__(self):
135
+ self.name = 'similar filter'
136
+ self.code = []
137
+
138
+ def is_similar(self, str1, str2):
139
+ # Check whether two strings are similar (they differ only in their digits)
140
+ pattern = re.compile(r'\d+')
141
+ return pattern.sub('', str1) == pattern.sub('', str2)
142
+
143
+ def encoder(self, inputs):
144
+ encoded_inputs = []
145
+ self.code = [] # clear self.code
146
+ i = 0
147
+ while i < len(inputs):
148
+ encoded_inputs.append(inputs[i])
149
+ similar_strs = [inputs[i]]
150
+ j = i + 1
151
+ while j < len(inputs) and self.is_similar(inputs[i], inputs[j]):
152
+ similar_strs.append(inputs[j])
153
+ j += 1
154
+ if len(similar_strs) > 1:
155
+ self.code.append((i, similar_strs)) # record the start index and the list of similar strings
156
+ i = j
157
+ return encoded_inputs
158
+
159
+ def decoder(self, inputs:List):
160
+ decoded_inputs = inputs
161
+ for i, similar_strs in self.code:
162
+ pattern = re.compile(r'\d+')
163
+ for j in range(len(similar_strs)):
164
+ if pattern.search(similar_strs[j]):
165
+ number = re.findall(r'\d+', similar_strs[j])[0] # take the numeric part of the similar string
166
+ new_str = pattern.sub(number, inputs[i]) # replace the digits in the translated string with that number
167
+ else:
168
+ new_str = inputs[i] # if the similar string has no digits, use the translated string as is
169
+ if j > 0:
170
+ decoded_inputs.insert(i+j, new_str)
171
+ return decoded_inputs
172
+
173
+ class ChineseFilter:
174
+ def __init__(self, pinyin_lib_file='pinyin.txt'):
175
+ self.name = 'chinese filter'
176
+ self.code = []
177
+ self.pinyin_lib = self.load_pinyin_lib(pinyin_lib_file)
178
+
179
+ def load_pinyin_lib(self, file_path):
180
+ with open(os.path.join(script_dir,file_path), 'r', encoding='utf-8') as f:
181
+ return set(line.strip().lower() for line in f)
182
+
183
+ def is_valid_chinese(self, word):
184
+ # Check whether a word qualifies: a single word whose first letter is uppercase
185
+ if len(word.split()) == 1 and word[0].isupper():
186
+ # use is_pinyin to check whether the lowercased word is valid pinyin
187
+ return self.is_pinyin(word.lower())
188
+ return False
189
+
190
+ def encoder(self, inputs):
191
+ encoded_inputs = []
192
+ self.code = [] # clear self.code
193
+ for i, word in enumerate(inputs):
194
+ if self.is_valid_chinese(word):
195
+ self.code.append((i, word)) # record the index and the pinyin-like word to filter out
196
+ else:
197
+ encoded_inputs.append(word)
198
+ return encoded_inputs
199
+
200
+ def decoder(self, inputs):
201
+ decoded_inputs = inputs.copy()
202
+ for i, word in self.code:
203
+ decoded_inputs.insert(i, word) # re-insert the filtered word at its original position
204
+ return decoded_inputs
205
+
206
+ def is_pinyin(self, string):
207
+ '''
208
+ Judge whether a string is pinyin or an English word.
209
+ The pinyin library (self.pinyin_lib) is loaded from a txt file.
210
+ '''
211
+ string = string.lower()
212
+ stringlen = len(string)
213
+ max_len = 6
214
+ result = []
215
+ n = 0
216
+ while n < stringlen:
217
+ matched = 0
218
+ temp_result = []
219
+ for i in range(max_len, 0, -1):
220
+ s = string[0:i]
221
+ if s in self.pinyin_lib:
222
+ temp_result.append(string[:i])
223
+ matched = i
224
+ break
225
+ if i == 1 and len(temp_result) == 0:
226
+ return False
227
+ result.extend(temp_result)
228
+ string = string[matched:]
229
+ n += matched
230
+ return True
231
+
232
+ script_dir = os.path.dirname(os.path.abspath(__file__))
233
+ parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(script_dir)))
234
+
235
+ class Model():
236
+ def __init__(self, modelname, selected_lora_model, selected_gpu):
237
+ def get_gpu_index(gpu_info, target_gpu_name):
238
+ """
239
+ Get the index of the target GPU from the GPU info list.
240
+ Args:
241
+ gpu_info (list): list of GPU names
242
+ target_gpu_name (str): name of the target GPU
243
+
244
+ Returns:
245
+ int: index of the target GPU, or -1 if not found
246
+ """
247
+ for i, name in enumerate(gpu_info):
248
+ if target_gpu_name.lower() in name.lower():
249
+ return i
250
+ return -1
251
+ if selected_gpu != "cpu":
252
+ gpu_count = torch.cuda.device_count()
253
+ gpu_info = [torch.cuda.get_device_name(i) for i in range(gpu_count)]
254
+ selected_gpu_index = get_gpu_index(gpu_info, selected_gpu)
255
+ self.device_name = f"cuda:{selected_gpu_index}"
256
+ else:
257
+ self.device_name = "cpu"
258
+ print("device_name", self.device_name)
259
+ self.model = AutoModelForSeq2SeqLM.from_pretrained(modelname).to(self.device_name)
260
+ self.tokenizer = AutoTokenizer.from_pretrained(modelname)
261
+ # self.translator = pipeline('translation', model=self.original_model, tokenizer=self.tokenizer, src_lang=original_language, tgt_lang=target_language, device=device)
262
+
263
+ def generate(self, inputs, original_language, target_languages, max_batch_size):
264
+ filter_list = [SpecialTokenFilter(), SperSignFilter(), ParenSParenFilter(), ChevronsFilter(), SimilarFilter(), ChineseFilter()]
265
+ filter_pipeline = FilterPipeline(filter_list)
266
+ def language_mapping(original_language):
267
+ d = {
268
+ "Achinese (Arabic script)": "ace_Arab",
269
+ "Achinese (Latin script)": "ace_Latn",
270
+ "Mesopotamian Arabic": "acm_Arab",
271
+ "Ta'izzi-Adeni Arabic": "acq_Arab",
272
+ "Tunisian Arabic": "aeb_Arab",
273
+ "Afrikaans": "afr_Latn",
274
+ "South Levantine Arabic": "ajp_Arab",
275
+ "Akan": "aka_Latn",
276
+ "Amharic": "amh_Ethi",
277
+ "North Levantine Arabic": "apc_Arab",
278
+ "Standard Arabic": "arb_Arab",
279
+ "Najdi Arabic": "ars_Arab",
280
+ "Moroccan Arabic": "ary_Arab",
281
+ "Egyptian Arabic": "arz_Arab",
282
+ "Assamese": "asm_Beng",
283
+ "Asturian": "ast_Latn",
284
+ "Awadhi": "awa_Deva",
285
+ "Central Aymara": "ayr_Latn",
286
+ "South Azerbaijani": "azb_Arab",
287
+ "North Azerbaijani": "azj_Latn",
288
+ "Bashkir": "bak_Cyrl",
289
+ "Bambara": "bam_Latn",
290
+ "Balinese": "ban_Latn",
291
+ "Belarusian": "bel_Cyrl",
292
+ "Bemba": "bem_Latn",
293
+ "Bengali": "ben_Beng",
294
+ "Bhojpuri": "bho_Deva",
295
+ "Banjar (Arabic script)": "bjn_Arab",
296
+ "Banjar (Latin script)": "bjn_Latn",
297
+ "Tibetan": "bod_Tibt",
298
+ "Bosnian": "bos_Latn",
299
+ "Buginese": "bug_Latn",
300
+ "Bulgarian": "bul_Cyrl",
301
+ "Catalan": "cat_Latn",
302
+ "Cebuano": "ceb_Latn",
303
+ "Czech": "ces_Latn",
304
+ "Chokwe": "cjk_Latn",
305
+ "Central Kurdish": "ckb_Arab",
306
+ "Crimean Tatar": "crh_Latn",
307
+ "Welsh": "cym_Latn",
308
+ "Danish": "dan_Latn",
309
+ "German": "deu_Latn",
310
+ "Dinka": "dik_Latn",
311
+ "Jula": "dyu_Latn",
312
+ "Dzongkha": "dzo_Tibt",
313
+ "Greek": "ell_Grek",
314
+ "English": "eng_Latn",
315
+ "Esperanto": "epo_Latn",
316
+ "Estonian": "est_Latn",
317
+ "Basque": "eus_Latn",
318
+ "Ewe": "ewe_Latn",
319
+ "Faroese": "fao_Latn",
320
+ "Persian": "pes_Arab",
321
+ "Fijian": "fij_Latn",
322
+ "Finnish": "fin_Latn",
323
+ "Fon": "fon_Latn",
324
+ "French": "fra_Latn",
325
+ "Friulian": "fur_Latn",
326
+ "Nigerian Fulfulde": "fuv_Latn",
327
+ "Scottish Gaelic": "gla_Latn",
328
+ "Irish": "gle_Latn",
329
+ "Galician": "glg_Latn",
330
+ "Guarani": "grn_Latn",
331
+ "Gujarati": "guj_Gujr",
332
+ "Haitian Creole": "hat_Latn",
333
+ "Hausa": "hau_Latn",
334
+ "Hebrew": "heb_Hebr",
335
+ "Hindi": "hin_Deva",
336
+ "Chhattisgarhi": "hne_Deva",
337
+ "Croatian": "hrv_Latn",
338
+ "Hungarian": "hun_Latn",
339
+ "Armenian": "hye_Armn",
340
+ "Igbo": "ibo_Latn",
341
+ "Iloko": "ilo_Latn",
342
+ "Indonesian": "ind_Latn",
343
+ "Icelandic": "isl_Latn",
344
+ "Italian": "ita_Latn",
345
+ "Javanese": "jav_Latn",
346
+ "Japanese": "jpn_Jpan",
347
+ "Kabyle": "kab_Latn",
348
+ "Kachin": "kac_Latn",
349
+ "Arabic": "ar_AR",
350
+ "Chinese": "zho_Hans",
351
+ "Spanish": "spa_Latn",
352
+ "Dutch": "nld_Latn",
353
+ "Kazakh": "kaz_Cyrl",
354
+ "Korean": "kor_Hang",
355
+ "Lithuanian": "lit_Latn",
356
+ "Malayalam": "mal_Mlym",
357
+ "Marathi": "mar_Deva",
358
+ "Nepali": "ne_NP",
359
+ "Polish": "pol_Latn",
360
+ "Portuguese": "por_Latn",
361
+ "Russian": "rus_Cyrl",
362
+ "Sinhala": "sin_Sinh",
363
+ "Tamil": "tam_Taml",
364
+ "Turkish": "tur_Latn",
365
+ "Ukrainian": "ukr_Cyrl",
366
+ "Urdu": "urd_Arab",
367
+ "Vietnamese": "vie_Latn",
368
+ "Thai":"tha_Thai",
369
+ "Khmer":"khm_Khmr"
370
+ }
371
+ return d[original_language]
372
+ def process_gpu_translate_result(temp_outputs):
373
+ outputs = []
374
+ for temp_output in temp_outputs:
375
+ length = len(temp_output[0]["generated_translation"])
376
+ for i in range(length):
377
+ temp = []
378
+ for trans in temp_output:
379
+ temp.append({
380
+ "target_language": trans["target_language"],
381
+ "generated_translation": trans['generated_translation'][i],
382
+ })
383
+ outputs.append(temp)
384
+ excel_writer = ExcelFileWriter()
385
+ excel_writer.write_text(os.path.join(parent_dir,r"temp/empty.xlsx"), outputs, 'A', 1, len(outputs))
386
+ self.tokenizer.src_lang = language_mapping(original_language)
387
+ if self.device_name == "cpu":
388
+ # Tokenize input
389
+ input_ids = self.tokenizer(inputs, return_tensors="pt", padding=True, max_length=128).to(self.device_name)
390
+ output = []
391
+ for target_language in target_languages:
392
+ # Get language code for the target language
393
+ target_lang_code = self.tokenizer.lang_code_to_id[language_mapping(target_language)]
394
+ # Generate translation
395
+ generated_tokens = self.model.generate(
396
+ **input_ids,
397
+ forced_bos_token_id=target_lang_code,
398
+ max_length=128
399
+ )
400
+ generated_translation = self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
401
+ # Append result to output
402
+ output.append({
403
+ "target_language": target_language,
404
+ "generated_translation": generated_translation,
405
+ })
406
+ outputs = []
407
+ length = len(output[0]["generated_translation"])
408
+ for i in range(length):
409
+ temp = []
410
+ for trans in output:
411
+ temp.append({
412
+ "target_language": trans["target_language"],
413
+ "generated_translation": trans['generated_translation'][i],
414
+ })
415
+ outputs.append(temp)
416
+ return outputs
417
+ else:
418
+ # max batch size = available GPU memory in bytes / 4 / (tensor size + trainable parameters)
419
+ # max_batch_size = 10
420
+ # Ensure batch size is within model limits:
421
+ print("length of inputs: ",len(inputs))
422
+ batch_size = min(len(inputs), int(max_batch_size))
423
+ batches = [inputs[i:i + batch_size] for i in range(0, len(inputs), batch_size)]
424
+ print("length of batches size: ", len(batches))
425
+ temp_outputs = []
426
+ processed_num = 0
427
+ for index, batch in enumerate(batches):
428
+ # Tokenize input
429
+ print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
430
+ print(len(batch))
431
+ print(batch)
432
+ batch = filter_pipeline.batch_encoder(batch)
433
+ print(batch)
434
+ temp = []
435
+ if len(batch) > 0:
436
+ input_ids = self.tokenizer(batch, return_tensors="pt", padding=True).to(self.device_name)
437
+ for target_language in target_languages:
438
+ target_lang_code = self.tokenizer.lang_code_to_id[language_mapping(target_language)]
439
+ generated_tokens = self.model.generate(
440
+ **input_ids,
441
+ forced_bos_token_id=target_lang_code,
442
+ )
443
+ generated_translation = self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
444
+
445
+ print(generated_translation)
446
+ generated_translation = filter_pipeline.batch_decoder(generated_translation)
447
+ print(generated_translation)
448
+ print(len(generated_translation))
449
+ # Append result to output
450
+ temp.append({
451
+ "target_language": target_language,
452
+ "generated_translation": generated_translation,
453
+ })
454
+ input_ids.to('cpu')
455
+ del input_ids
456
+ else:
457
+ for target_language in target_languages:
458
+ generated_translation = filter_pipeline.batch_decoder(batch)
459
+ print(generated_translation)
460
+ print(len(generated_translation))
461
+ # Append result to output
462
+ temp.append({
463
+ "target_language": target_language,
464
+ "generated_translation": generated_translation,
465
+ })
466
+ temp_outputs.append(temp)
467
+ processed_num += len(batch)
468
+ if (index + 1) * max_batch_size // 1000 - index * max_batch_size // 1000 == 1:
469
+ print("Already processed number: ", len(temp_outputs))
470
+ process_gpu_translate_result(temp_outputs)
471
+ outputs = []
472
+ for temp_output in temp_outputs:
473
+ length = len(temp_output[0]["generated_translation"])
474
+ for i in range(length):
475
+ temp = []
476
+ for trans in temp_output:
477
+ temp.append({
478
+ "target_language": trans["target_language"],
479
+ "generated_translation": trans['generated_translation'][i],
480
+ })
481
+ outputs.append(temp)
482
+ return outputs
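The filter classes above implement a placeholder round trip: batch_encoder swaps fragile substrings ('%s', '(s)', <...> tags, symbol-only rows) for single-character placeholders before tokenization, and batch_decoder restores them in reverse filter order after generation. A minimal sketch of that round trip, assuming model.py and its modules.file dependency are importable; SimilarFilter and ChineseFilter are left out (the latter needs pinyin.txt next to the script):

```python
# Minimal sketch (not part of the commit): exercise the FilterPipeline round trip
# on its own, without loading a model.
from model import (FilterPipeline, SpecialTokenFilter, SperSignFilter,
                   ParenSParenFilter, ChevronsFilter)

pipeline = FilterPipeline([SpecialTokenFilter(), SperSignFilter(),
                           ParenSParenFilter(), ChevronsFilter()])

source = ["Delete %s file(s)?", "<b>Save</b>", "!"]

encoded = pipeline.batch_encoder(source)
print(encoded)   # ['Delete * file$?', '#Save#'] -- the bare "!" row is held back

translated = [s.upper() for s in encoded]        # stand-in for Model.generate()
restored = pipeline.batch_decoder(translated)
print(restored)  # ['DELETE %s FILE(s)?', '<b>SAVE</b>', '!']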
pinyin.txt ADDED
@@ -0,0 +1,408 @@
1
+ a
2
+ ai
3
+ an
4
+ ang
5
+ ao
6
+ ba
7
+ bai
8
+ ban
9
+ bang
10
+ bao
11
+ bei
12
+ ben
13
+ beng
14
+ bi
15
+ bian
16
+ biao
17
+ bie
18
+ bin
19
+ bing
20
+ bo
21
+ bu
22
+ ca
23
+ cai
24
+ can
25
+ cang
26
+ cao
27
+ ce
28
+ cen
29
+ ceng
30
+ cha
31
+ chai
32
+ chan
33
+ chang
34
+ chao
35
+ che
36
+ chen
37
+ cheng
38
+ chi
39
+ chong
40
+ chou
41
+ chu
42
+ chua
43
+ chuai
44
+ chuan
45
+ chuang
46
+ chui
47
+ chun
48
+ chuo
49
+ ci
50
+ cong
51
+ cou
52
+ cu
53
+ cuan
54
+ cui
55
+ cun
56
+ cuo
57
+ da
58
+ dai
59
+ dan
60
+ dang
61
+ dao
62
+ de
63
+ dei
64
+ den
65
+ deng
66
+ di
67
+ dia
68
+ dian
69
+ diao
70
+ die
71
+ ding
72
+ diu
73
+ dong
74
+ dou
75
+ du
76
+ duan
77
+ dui
78
+ dun
79
+ duo
80
+ e
81
+ ei
82
+ en
83
+ eng
84
+ er
85
+ fa
86
+ fan
87
+ fang
88
+ fei
89
+ fen
90
+ feng
91
+ fo
92
+ fou
93
+ fu
94
+ ga
95
+ gai
96
+ gan
97
+ gang
98
+ gao
99
+ ge
100
+ gei
101
+ gen
102
+ geng
103
+ gong
104
+ gou
105
+ gu
106
+ gua
107
+ guai
108
+ guan
109
+ guang
110
+ gui
111
+ gun
112
+ guo
113
+ ha
114
+ hai
115
+ han
116
+ hang
117
+ hao
118
+ he
119
+ hei
120
+ hen
121
+ heng
122
+ hong
123
+ hou
124
+ hu
125
+ hua
126
+ huai
127
+ huan
128
+ huang
129
+ hui
130
+ hun
131
+ huo
132
+ ji
133
+ jia
134
+ jian
135
+ jiang
136
+ jiao
137
+ jie
138
+ jin
139
+ jing
140
+ jiong
141
+ jiu
142
+ ju
143
+ juan
144
+ jue
145
+ jun
146
+ ka
147
+ kai
148
+ kan
149
+ kang
150
+ kao
151
+ ke
152
+ ken
153
+ keng
154
+ kong
155
+ kou
156
+ ku
157
+ kua
158
+ kuai
159
+ kuan
160
+ kuang
161
+ kui
162
+ kun
163
+ kuo
164
+ la
165
+ lai
166
+ lan
167
+ lang
168
+ lao
169
+ le
170
+ lei
171
+ leng
172
+ li
173
+ lia
174
+ lian
175
+ liang
176
+ liao
177
+ lie
178
+ lin
179
+ ling
180
+ liu
181
+ long
182
+ lou
183
+ lu
184
+ luan
185
+ lü
186
+ lüe
187
+ lun
188
+ luo
189
+ ma
190
+ mai
191
+ man
192
+ mang
193
+ mao
194
+ me
195
+ mei
196
+ men
197
+ meng
198
+ mi
199
+ mian
200
+ miao
201
+ mie
202
+ min
203
+ ming
204
+ miu
205
+ mo
206
+ mou
207
+ mu
208
+ na
209
+ nai
210
+ nan
211
+ nang
212
+ nao
213
+ ne
214
+ nei
215
+ nen
216
+ neng
217
+ ni
218
+ nian
219
+ niang
220
+ niao
221
+ nie
222
+ nin
223
+ ning
224
+ niu
225
+ nong
226
+ nou
227
+ nu
228
+ nü
229
+ nuan
230
+ nüe
231
+ nuo
232
+ nun
233
+ o
234
+ ou
235
+ pa
236
+ pai
237
+ pan
238
+ pang
239
+ pao
240
+ pei
241
+ pen
242
+ peng
243
+ pi
244
+ pian
245
+ piao
246
+ pie
247
+ pin
248
+ ping
249
+ po
250
+ pou
251
+ pu
252
+ qi
253
+ qia
254
+ qian
255
+ qiang
256
+ qiao
257
+ qie
258
+ qin
259
+ qing
260
+ qiong
261
+ qiu
262
+ qu
263
+ quan
264
+ que
265
+ qun
266
+ ran
267
+ rang
268
+ rao
269
+ re
270
+ ren
271
+ reng
272
+ ri
273
+ rong
274
+ rou
275
+ ru
276
+ ruan
277
+ rui
278
+ run
279
+ ruo
280
+ sa
281
+ sai
282
+ san
283
+ sang
284
+ sao
285
+ se
286
+ sen
287
+ seng
288
+ sha
289
+ shai
290
+ shan
291
+ shang
292
+ shao
293
+ she
294
+ shei
295
+ shen
296
+ sheng
297
+ shi
298
+ shou
299
+ shu
300
+ shua
301
+ shuai
302
+ shuan
303
+ shuang
304
+ shui
305
+ shun
306
+ shuo
307
+ si
308
+ song
309
+ sou
310
+ su
311
+ suan
312
+ sui
313
+ sun
314
+ suo
315
+ ta
316
+ tai
317
+ tan
318
+ tang
319
+ tao
320
+ te
321
+ teng
322
+ ti
323
+ tian
324
+ tiao
325
+ tie
326
+ ting
327
+ tong
328
+ tou
329
+ tu
330
+ tuan
331
+ tui
332
+ tun
333
+ tuo
334
+ wa
335
+ wai
336
+ wan
337
+ wang
338
+ wei
339
+ wen
340
+ weng
341
+ wo
342
+ wu
343
+ xi
344
+ xia
345
+ xian
346
+ xiang
347
+ xiao
348
+ xie
349
+ xin
350
+ xing
351
+ xiong
352
+ xiu
353
+ xu
354
+ xuan
355
+ xue
356
+ xun
357
+ ya
358
+ yan
359
+ yang
360
+ yao
361
+ ye
362
+ yi
363
+ yin
364
+ ying
365
+ yo
366
+ yong
367
+ you
368
+ yu
369
+ yuan
370
+ yue
371
+ yun
372
+ za
373
+ zai
374
+ zan
375
+ zang
376
+ zao
377
+ ze
378
+ zei
379
+ zen
380
+ zeng
381
+ zha
382
+ zhai
383
+ zhan
384
+ zhang
385
+ zhao
386
+ zhe
387
+ zhei
388
+ zhen
389
+ zheng
390
+ zhi
391
+ zhong
392
+ zhou
393
+ zhu
394
+ zhua
395
+ zhuai
396
+ zhuan
397
+ zhuang
398
+ zhui
399
+ zhun
400
+ zhuo
401
+ zi
402
+ zong
403
+ zou
404
+ zu
405
+ zuan
406
+ zui
407
+ zun
408
+ zuo
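This syllable table is what ChineseFilter.load_pinyin_lib reads and what is_pinyin consumes: a greedy longest-match scan (up to 6 letters per syllable) from the left, failing if any prefix cannot be matched. A standalone re-implementation for illustration only, with a tiny stand-in library instead of the full file:

```python
# Illustration only: mirrors the greedy matching in ChineseFilter.is_pinyin.
def is_pinyin(string, pinyin_lib):
    string = string.lower()
    while string:
        for i in range(6, 0, -1):          # try the longest candidate first
            if string[:i] in pinyin_lib:
                string = string[i:]        # consume the matched syllable
                break
        else:
            return False                   # nothing matched at this position
    return True

pinyin_lib = {"zhang", "wei", "ming"}      # tiny stand-in for pinyin.txt
print(is_pinyin("Zhangwei", pinyin_lib))   # True  -> looks like a pinyin name
print(is_pinyin("Server", pinyin_lib))     # False -> left in place for translation
```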
support_language.json ADDED
@@ -0,0 +1,210 @@
1
+ {
2
+ "original_language":[
3
+ "Achinese (Arabic script)",
4
+ "Achinese (Latin script)",
5
+ "Afrikaans",
6
+ "Akan",
7
+ "Amharic",
8
+ "Arabic",
9
+ "Armenian",
10
+ "Assamese",
11
+ "Asturian",
12
+ "Awadhi",
13
+ "Balinese",
14
+ "Bambara",
15
+ "Banjar (Arabic script)",
16
+ "Banjar (Latin script)",
17
+ "Bashkir",
18
+ "Basque",
19
+ "Belarusian",
20
+ "Bemba",
21
+ "Bengali",
22
+ "Bhojpuri",
23
+ "Bosnian",
24
+ "Buginese",
25
+ "Bulgarian",
26
+ "Catalan",
27
+ "Cebuano",
28
+ "Central Aymara",
29
+ "Central Kurdish",
30
+ "Chhattisgarhi",
31
+ "Chinese",
32
+ "Chokwe",
33
+ "Crimean Tatar",
34
+ "Croatian",
35
+ "Czech",
36
+ "Danish",
37
+ "Dinka",
38
+ "Dutch",
39
+ "Dzongkha",
40
+ "Egyptian Arabic",
41
+ "English",
42
+ "Esperanto",
43
+ "Estonian",
44
+ "Ewe",
45
+ "Faroese",
46
+ "Fijian",
47
+ "Finnish",
48
+ "Fon",
49
+ "French",
50
+ "Friulian",
51
+ "Galician",
52
+ "German",
53
+ "Greek",
54
+ "Guarani",
55
+ "Gujarati",
56
+ "Haitian Creole",
57
+ "Hausa",
58
+ "Hebrew",
59
+ "Hindi",
60
+ "Hungarian",
61
+ "Icelandic",
62
+ "Igbo",
63
+ "Iloko",
64
+ "Indonesian",
65
+ "Irish",
66
+ "Italian",
67
+ "Japanese",
68
+ "Javanese",
69
+ "Jula",
70
+ "Kabyle",
71
+ "Kachin",
72
+ "Kazakh",
73
+ "Khmer",
74
+ "Korean",
75
+ "Lithuanian",
76
+ "Malayalam",
77
+ "Marathi",
78
+ "Mesopotamian Arabic",
79
+ "Moroccan Arabic",
80
+ "Najdi Arabic",
81
+ "Nepali",
82
+ "Nigerian Fulfulde",
83
+ "North Azerbaijani",
84
+ "North Levantine Arabic",
85
+ "Persian",
86
+ "Polish",
87
+ "Portuguese",
88
+ "Russian",
89
+ "Scottish Gaelic",
90
+ "Sinhala",
91
+ "South Azerbaijani",
92
+ "South Levantine Arabic",
93
+ "Spanish",
94
+ "Standard Arabic",
95
+ "Ta'izzi-Adeni Arabic",
96
+ "Tamil",
97
+ "Thai",
98
+ "Tibetan",
99
+ "Tunisian Arabic",
100
+ "Turkish",
101
+ "Ukrainian",
102
+ "Urdu",
103
+ "Vietnamese",
104
+ "Welsh"
105
+ ],
106
+ "target_language":[
107
+ "Achinese (Arabic script)",
108
+ "Achinese (Latin script)",
109
+ "Afrikaans",
110
+ "Akan",
111
+ "Amharic",
112
+ "Arabic",
113
+ "Armenian",
114
+ "Assamese",
115
+ "Asturian",
116
+ "Awadhi",
117
+ "Balinese",
118
+ "Bambara",
119
+ "Banjar (Arabic script)",
120
+ "Banjar (Latin script)",
121
+ "Bashkir",
122
+ "Basque",
123
+ "Belarusian",
124
+ "Bemba",
125
+ "Bengali",
126
+ "Bhojpuri",
127
+ "Bosnian",
128
+ "Buginese",
129
+ "Bulgarian",
130
+ "Catalan",
131
+ "Cebuano",
132
+ "Central Aymara",
133
+ "Central Kurdish",
134
+ "Chhattisgarhi",
135
+ "Chinese",
136
+ "Chokwe",
137
+ "Crimean Tatar",
138
+ "Croatian",
139
+ "Czech",
140
+ "Danish",
141
+ "Dinka",
142
+ "Dutch",
143
+ "Dzongkha",
144
+ "Egyptian Arabic",
145
+ "English",
146
+ "Esperanto",
147
+ "Estonian",
148
+ "Ewe",
149
+ "Faroese",
150
+ "Fijian",
151
+ "Finnish",
152
+ "Fon",
153
+ "French",
154
+ "Friulian",
155
+ "Galician",
156
+ "German",
157
+ "Greek",
158
+ "Guarani",
159
+ "Gujarati",
160
+ "Haitian Creole",
161
+ "Hausa",
162
+ "Hebrew",
163
+ "Hindi",
164
+ "Hungarian",
165
+ "Icelandic",
166
+ "Igbo",
167
+ "Iloko",
168
+ "Indonesian",
169
+ "Irish",
170
+ "Italian",
171
+ "Japanese",
172
+ "Javanese",
173
+ "Jula",
174
+ "Kabyle",
175
+ "Kachin",
176
+ "Kazakh",
177
+ "Khmer",
178
+ "Korean",
179
+ "Lithuanian",
180
+ "Malayalam",
181
+ "Marathi",
182
+ "Mesopotamian Arabic",
183
+ "Moroccan Arabic",
184
+ "Najdi Arabic",
185
+ "Nepali",
186
+ "Nigerian Fulfulde",
187
+ "North Azerbaijani",
188
+ "North Levantine Arabic",
189
+ "Persian",
190
+ "Polish",
191
+ "Portuguese",
192
+ "Russian",
193
+ "Scottish Gaelic",
194
+ "Sinhala",
195
+ "South Azerbaijani",
196
+ "South Levantine Arabic",
197
+ "Spanish",
198
+ "Standard Arabic",
199
+ "Ta'izzi-Adeni Arabic",
200
+ "Tamil",
201
+ "Thai",
202
+ "Tibetan",
203
+ "Tunisian Arabic",
204
+ "Turkish",
205
+ "Ukrainian",
206
+ "Urdu",
207
+ "Vietnamese",
208
+ "Welsh"
209
+ ]
210
+ }
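support_language.json only lists display names; each name still has to resolve to a FLORES-200 code through the language_mapping dict inside Model.generate in model.py, and the source and target lists are presumably meant to stay identical. A small hedged sanity check of that assumption:

```python
# Sketch (not part of the commit): check that source and target language lists
# in support_language.json stay in sync.
import json

with open("support_language.json", encoding="utf-8") as f:
    supported = json.load(f)

assert supported["original_language"] == supported["target_language"]
print(len(supported["original_language"]), "languages offered on both sides")
```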
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1f1dd90d797537c6417e700eed2339aabf4272377d4fdce4a2c982061690fd04
3
- size 17331547
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea576e70b8821af785b2752bbe892ab616e85497f8726b368c22aa03ee5a2d78
3
+ size 17331288
tokenizer_config.json CHANGED
@@ -1871,7 +1871,7 @@
1871
  "pad_token": "<pad>",
1872
  "sep_token": "</s>",
1873
  "sp_model_kwargs": {},
1874
- "src_lang": "zho_Hans",
1875
  "tgt_lang": null,
1876
  "tokenizer_class": "NllbTokenizer",
1877
  "unk_token": "<unk>"
 
1871
  "pad_token": "<pad>",
1872
  "sep_token": "</s>",
1873
  "sp_model_kwargs": {},
1874
+ "src_lang": "khm_Khmr",
1875
  "tgt_lang": null,
1876
  "tokenizer_class": "NllbTokenizer",
1877
  "unk_token": "<unk>"
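The tokenizer_config.json change above only moves the checked-in default src_lang from zho_Hans to khm_Khmr; model.py sets tokenizer.src_lang per request anyway, but code that loads the tokenizer directly may prefer to pin the source language explicitly rather than depend on that default. A small sketch, with the repo id as a placeholder:

```python
# Sketch: pin src_lang at load time instead of relying on the default written
# into tokenizer_config.json ("khm_Khmr" after this commit). Repo id is a placeholder.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("your-user/your-nllb-checkpoint",
                                          src_lang="zho_Hans")
print(tokenizer.src_lang)   # "zho_Hans", regardless of the checked-in default
```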