# Hugging Face Spaces page banner captured with this file (status: Sleeping); not part of the program.
import gradio as gr
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
# Hugging Face Hub id of the checkpoint used by this app.
model_name = "dsfsi/nso-en-m2m100-gov"

# Loaded once at import time so every translation request reuses them.
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

# Input is tokenized with source language code "ns"; generation is configured
# to begin with the tokenizer's language id for "en".
tokenizer.src_lang = "ns"
model.config.forced_bos_token_id = tokenizer.get_lang_id("en")
def translate(inp: str) -> str:
    """Translate text to English using the module-level tokenizer and model.

    Args:
        inp: Text to translate. ``None``, an empty string, or whitespace-only
            text is answered with an empty string without running the model.

    Returns:
        The decoded first generated sequence with special tokens removed, or
        ``""`` when there is nothing to translate.
    """
    # Nothing to translate: skip tokenization and generation entirely.
    if not inp or not inp.strip():
        return ""

    # truncation=True keeps over-long input within the tokenizer's configured
    # maximum length instead of passing an arbitrarily long sequence on.
    inputs = tokenizer(inp, return_tensors="pt", truncation=True)
    translated_tokens = model.generate(
        **inputs,
        max_length=512,
        forced_bos_token_id=tokenizer.get_lang_id("en"),
    )
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
# HTML/Markdown fragments rendered by the UI below. The text is kept flush-left
# inside the literals so the rendered content does not pick up code indentation.
_LOGO_HTML = """
<div style='text-align: center;'>
<img src='file/logo_transparent_small.png' alt='Logo' width='150'/>
</div>
"""

_INTRO_HTML = """
<h1 style='text-align: center;'>Northern Sotho to English Translation</h1>
<p style='text-align: center;'>This space provides a translation service from Northern Sotho to English using the M2M100 model, fine-tuned for low-resource languages. It supports researchers, linguists, and users working with Northern Sotho texts.</p>
"""

_LINKS_HTML = """
<div style='text-align: center;'>
<a href='https://github.com/dsfsi/nso-en-m2m100-gov' target='_blank'>GitHub</a> |
<a href='https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform' target='_blank'>Feedback Form</a> |
<a href='https://arxiv.org/abs/2303.03750' target='_blank'>Arxiv</a>
</div>
<br/>
"""

_ABOUT_HTML = """
<h4 style="text-align: center;">More information about the space</h4>
<p>This is a variant of the M2M100 model, fine-tuned on a multilingual dataset to support translation from Northern Sotho (Sepedi) to English. The model was trained with a focus on improving translation accuracy for low-resource languages.</p>
"""

_AUTHORS_HTML = """
<div style='text-align: center;'>
Authors: Vukosi Marivate, Matimba Shingange, Richard Lastrucci,
Isheanesu Joseph Dzingirai, Jenalea Rajab
</div>
"""

_CITATION_HTML = """
<pre style="text-align: left; white-space: pre-wrap;">
@inproceedings{lastrucci-etal-2023-preparing,
title = "Preparing the Vuk{'}uzenzele and {ZA}-gov-multilingual {S}outh {A}frican multilingual corpora",
author = "Richard Lastrucci and Isheanesu Dzingirai and Jenalea Rajab
and Andani Madodonga and Matimba Shingange and Daniel Njini and Vukosi Marivate",
booktitle = "Proceedings of the Fourth workshop on Resources for African Indigenous Languages (RAIL 2023)",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.rail-1.3",
pages = "18--25"
}
</pre>
"""

_DOI_HTML = """
<div style='text-align: center;'>
DOI: <a href="https://doi.org/10.48550/arXiv.2303.03750" target="_blank">10.48550/arXiv.2303.03750</a>
</div>
"""

# Page layout: logo and intro, the translation form, external links, and a
# collapsed accordion with background information.
with gr.Blocks() as demo:
    gr.Markdown(_LOGO_HTML)
    gr.Markdown(_INTRO_HTML)

    # Translation form: the button sends the input box to translate() and
    # writes the returned text to the output box.
    inp_text = gr.Textbox(lines=5, placeholder="Enter Northern Sotho text (maximum 5 lines)", label="Input")
    output_text = gr.Textbox(label="Output")
    translate_button = gr.Button("Translate")
    translate_button.click(translate, inputs=inp_text, outputs=output_text)

    gr.Markdown(_LINKS_HTML)

    # NOTE(review): the original nesting was lost; authors, citation and DOI
    # are assumed to belong inside this accordion -- confirm against the
    # deployed app.
    with gr.Accordion("More Information", open=False):
        gr.Markdown(_ABOUT_HTML)
        gr.Markdown(_AUTHORS_HTML)
        gr.Markdown(_CITATION_HTML)
        gr.Markdown(_DOI_HTML)
# Start the Gradio server for the interface defined above.
demo.launch()