# Non-code residue captured with the source (not part of the program): "Spaces: Running / Running"
import os

import ctranslate2
import gradio as gr
from huggingface_hub import snapshot_download
from sentencepiece import SentencePieceProcessor
# Heading text; passed to gr.Interface as ``title``.
title = "MADLAD-400 Translation Demo"

# HTML blurb passed to gr.Interface as ``description``. It links to the
# MADLAD-400 paper and to the two model repositories the demo is built on.
description = """
<p>
Translator using <a href='https://arxiv.org/abs/2309.04662' target='_blank'>MADLAD-400</a>, a multilingual machine translation model on 250 billion tokens covering over 450 languages using publicly available data. This demo application uses <a href="https://huggingface.co/santhosh/madlad400-3b-ct2">santhosh/madlad400-3b-ct2</a> model, which is a ctranslate2 optimized model of <a href="https://huggingface.co/google/madlad400-3b-mt">google/madlad400-3b-mt</a>
</p>
"""
# Performance tuning, as per https://opennmt.net/CTranslate2/performance.html:
# CTranslate2 is compiled with Intel MKL by default, and enabling the
# experimental packed GEMM setting was observed to have a significant positive
# impact on speed. The variable is set before the Translator is constructed.
os.environ["CT2_USE_EXPERIMENTAL_PACKED_GEMM"] = "1"

# Hub repository holding the CTranslate2 conversion of the MADLAD-400 3B model.
model_name = "santhosh/madlad400-3b-ct2"
model_path = snapshot_download(model_name)

# The SentencePiece model file is read from inside the downloaded snapshot.
tokenizer = SentencePieceProcessor()
tokenizer.load(os.path.join(model_path, "sentencepiece.model"))

translator = ctranslate2.Translator(model_path)

# Number of leading vocabulary ids inspected for "<2xx>" language tags.
# NOTE(review): presumably every language tag sits within the first 460 ids of
# this vocabulary -- confirm against the tokenizer before changing the model.
_LANG_TOKEN_SCAN_LIMIT = 460

# Decode each inspected id to text and keep the code between "<2" and ">",
# e.g. "<2ml>" -> "ml". These codes populate the target-language dropdown.
tokens = [tokenizer.decode(i) for i in range(_LANG_TOKEN_SCAN_LIMIT)]
lang_codes = [token[2:-1] for token in tokens if token.startswith("<2")]
def translate(input_text: str, target_language: str) -> str:
    """Translate ``input_text`` into ``target_language``.

    The text is prefixed with a ``<2{target_language}>`` tag, tokenized into
    string pieces with the module-level ``tokenizer``, run through the
    module-level ``translator`` and the first hypothesis is decoded back to
    text.

    Args:
        input_text: Text to translate.
        target_language: Language code without the ``<2`` / ``>`` wrapper,
            e.g. ``"ml"``.

    Returns:
        The translated text, or an empty string when ``input_text`` is empty,
        ``None`` or whitespace only (the model is not invoked in that case).
    """
    # Nothing to translate: avoid sending a bare language tag to the model.
    if not input_text or not input_text.strip():
        return ""

    input_tokens = tokenizer.encode(
        f"<2{target_language}> {input_text}", out_type=str
    )
    # beam_size=1 keeps a single candidate per step. max_batch_size and
    # repetition_penalty are left at the library defaults.
    # NOTE(review): no_repeat_ngram_size=1 forbids repeating n-grams of size
    # one, which looks like it blocks any token from occurring twice in the
    # output -- confirm this is intended for longer inputs.
    results = translator.translate_batch(
        [input_tokens],
        batch_type="tokens",
        beam_size=1,
        no_repeat_ngram_size=1,
    )
    return tokenizer.decode(results[0].hypotheses[0])
def translate_interface(input_text: str, target_language: str) -> str:
    """Callback handed to ``gr.Interface``: translate the submitted text.

    Thin adapter that forwards both arguments to ``translate`` unchanged.

    Args:
        input_text: Text to translate.
        target_language: Target language code, forwarded as-is.

    Returns:
        Whatever ``translate`` returns for the given arguments.
    """
    return translate(input_text, target_language)
# Source text box, pre-filled with a sample sentence.
input_text = gr.Textbox(
    label="Input Text",
    value="Imagine a world in which every single person on the planet is given free access to the sum of all human knowledge.",
)
# Target language picker; choices are the codes in ``lang_codes`` and the
# initial selection is "ml".
target_language = gr.Dropdown(lang_codes, value="ml", label="Target Language")
output_text = gr.Textbox(label="Translated Text")

# Build the interface and start serving it. The interface is bound to a name
# before launching so it stays reachable as a module-level object.
demo = gr.Interface(
    fn=translate_interface,
    inputs=[input_text, target_language],
    outputs=output_text,
    title=title,
    description=description,
)
demo.launch()