# Spaces: Runtime error  (page-status text captured together with this source; not part of the program)
# coding=utf-8
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Tone-mark relocation table for the vowel pairs "oa", "oe" and "uy".
# Each key carries the tone mark on the first vowel; its value is the same
# pair with the mark moved to the second vowel. Lower-case, capitalised and
# upper-case spellings are listed explicitly because str.replace is
# case-sensitive.
dict_map = {
    # o + a
    "òa": "oà",
    "Òa": "Oà",
    "ÒA": "OÀ",
    "óa": "oá",
    "Óa": "Oá",
    "ÓA": "OÁ",
    "ỏa": "oả",
    "Ỏa": "Oả",
    "ỎA": "OẢ",
    "õa": "oã",
    "Õa": "Oã",
    "ÕA": "OÃ",
    "ọa": "oạ",
    "Ọa": "Oạ",
    "ỌA": "OẠ",
    # o + e
    "òe": "oè",
    "Òe": "Oè",
    "ÒE": "OÈ",
    "óe": "oé",
    "Óe": "Oé",
    "ÓE": "OÉ",
    "ỏe": "oẻ",
    "Ỏe": "Oẻ",
    "ỎE": "OẺ",
    "õe": "oẽ",
    "Õe": "Oẽ",
    "ÕE": "OẼ",
    "ọe": "oẹ",
    "Ọe": "Oẹ",
    "ỌE": "OẸ",
    # u + y
    "ùy": "uỳ",
    "Ùy": "Uỳ",
    "ÙY": "UỲ",
    "úy": "uý",
    "Úy": "Uý",
    "ÚY": "UÝ",
    "ủy": "uỷ",
    "Ủy": "Uỷ",
    "ỦY": "UỶ",
    "ũy": "uỹ",
    "Ũy": "Uỹ",
    "ŨY": "UỸ",
    "ụy": "uỵ",
    "Ụy": "Uỵ",
    "ỤY": "UỴ",
}
# Load the Vietnamese -> English tokenizer and model once at import time;
# translate_vi2en() reads these module-level objects on every call.
tokenizer_vi2en = AutoTokenizer.from_pretrained(
    "vinai/vinai-translate-vi2en", src_lang="vi_VN"
)
model_vi2en = AutoModelForSeq2SeqLM.from_pretrained("vinai/vinai-translate-vi2en")
def translate_vi2en(vi_text: str) -> str:
    """Translate Vietnamese text to English.

    Every key of the module-level ``dict_map`` found in the text is replaced
    by its value, then the text is tokenized with ``tokenizer_vi2en`` and
    decoded by ``model_vi2en`` using beam search.

    Args:
        vi_text: Vietnamese source text.

    Returns:
        The decoded English text. An empty or whitespace-only ``vi_text``
        returns ``""`` without running the tokenizer or the model.
    """
    # Nothing to translate: skip tokenization and generation entirely.
    if not vi_text.strip():
        return ""

    # Rewrite tone-mark placement before tokenizing
    # (presumably the convention the model expects - confirm against the
    # model card).
    for old, new in dict_map.items():
        vi_text = vi_text.replace(old, new)

    input_ids = tokenizer_vi2en(vi_text, return_tensors="pt").input_ids
    output_ids = model_vi2en.generate(
        input_ids,
        # Start decoding with the English language code.
        decoder_start_token_id=tokenizer_vi2en.lang_code_to_id["en_XX"],
        num_return_sequences=1,
        # Beam search. Sampling alternative: do_sample=True, top_k=100,
        # top_p=0.8 in place of the two arguments below.
        num_beams=5,
        early_stopping=True,
    )
    decoded = tokenizer_vi2en.batch_decode(output_ids, skip_special_tokens=True)
    return " ".join(decoded)
# Load the English -> Vietnamese tokenizer and model once at import time;
# translate_en2vi() reads these module-level objects on every call.
tokenizer_en2vi = AutoTokenizer.from_pretrained(
    "vinai/vinai-translate-en2vi", src_lang="en_XX"
)
model_en2vi = AutoModelForSeq2SeqLM.from_pretrained("vinai/vinai-translate-en2vi")
def translate_en2vi(en_text: str) -> str:
    """Translate English text to Vietnamese.

    The text is tokenized with the module-level ``tokenizer_en2vi`` and
    decoded by ``model_en2vi`` using beam search.

    Args:
        en_text: English source text.

    Returns:
        The decoded Vietnamese text. An empty or whitespace-only ``en_text``
        returns ``""`` without running the tokenizer or the model.
    """
    # Nothing to translate: skip tokenization and generation entirely.
    if not en_text.strip():
        return ""

    input_ids = tokenizer_en2vi(en_text, return_tensors="pt").input_ids
    output_ids = model_en2vi.generate(
        input_ids,
        # Start decoding with the Vietnamese language code.
        decoder_start_token_id=tokenizer_en2vi.lang_code_to_id["vi_VN"],
        num_return_sequences=1,
        # Beam search. Sampling alternative: do_sample=True, top_k=100,
        # top_p=0.8 in place of the two arguments below.
        num_beams=5,
        early_stopping=True,
    )
    decoded = tokenizer_en2vi.batch_decode(output_ids, skip_special_tokens=True)
    return " ".join(decoded)