Spaces:
Sleeping
Sleeping
File size: 4,054 Bytes
3d4f13a b5f7961 3d4f13a 4f63972 ef9b88b 3b68341 ef9b88b b02baad 3b68341 4f70f9f b5f7961 3d4f13a 3363f16 8076cbb 263fe73 0e7ef21 263fe73 692156b 0e7ef21 263fe73 0e7ef21 263fe73 0e7ef21 263fe73 0e7ef21 263fe73 0e7ef21 263fe73 0e7ef21 263fe73 692156b 3363f16 dbeaa8d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
import gradio as gr
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
# Checkpoint identifier handed to ``from_pretrained`` for both the tokenizer
# and the model below.
model_name = "dsfsi/nso-en-m2m100-gov"

# Tokenizer: load it, then mark "ns" as the source language of incoming text.
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
tokenizer.src_lang = "ns"

# Model: load it, then make the tokenizer's id for "en" the forced first
# token of every generation by default.
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
model.config.forced_bos_token_id = tokenizer.get_lang_id("en")
def translate(inp) -> str:
    """Translate the given source text to English with the loaded model.

    Args:
        inp: Text to translate, as received from the input textbox. ``None``,
            an empty string, or whitespace-only text is treated as "nothing
            to translate".

    Returns:
        The decoded first generated sequence with special tokens removed, or
        an empty string when there is nothing to translate.
    """
    # Skip the tokenizer/model entirely for blank input, so pressing
    # "Translate" on an empty box yields an empty result rather than whatever
    # the model would generate from a prompt with no content.
    if inp is None or not inp.strip():
        return ""

    inputs = tokenizer(inp, return_tensors="pt")
    translated_tokens = model.generate(
        **inputs,
        max_length=512,
        # Force English as the first generated token for this call.
        forced_bos_token_id=tokenizer.get_lang_id("en"),
    )
    # Only the first sequence is decoded.
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
# Gradio front end for the translator.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped -- confirm it matches the deployed layout.
# Multi-line string contents are kept flush-left so their bytes are unchanged.
with gr.Blocks() as demo:
    # Header row: a wide middle column (scale=4) holding the logo and intro
    # text, flanked by two empty scale=1 columns (presumably spacers).
    with gr.Row():
        with gr.Column(scale=1):
            pass
        with gr.Column(scale=4, min_width=1000):
            gr.Image(
                "logo_transparent_small.png",
                elem_id="logo",
                show_label=False,
                width=500,
            )
            gr.Markdown("""
<h1 style='text-align: center;'>Northern Sotho to English Translation</h1>
<p style='text-align: center;'>This space provides a translation service from Northern Sotho to English using the M2M100 model, fine-tuned for low-resource languages. It supports researchers, linguists, and users working with Northern Sotho texts.</p>
""")
        with gr.Column(scale=1):
            pass

    # Translation panel: input box, output box, and the button wiring them
    # through ``translate``.
    with gr.Column(variant="panel"):
        inp_text = gr.Textbox(
            lines=5,
            placeholder="Enter Northern Sotho text (maximum 5 lines)",
            label="Input",
            elem_id="centered-input",
        )
        output_text = gr.Textbox(
            label="Output",
            elem_id="centered-output",
        )
        translate_button = gr.Button(
            "Translate",
            elem_id="centered-button",
        )
        translate_button.click(
            translate,
            inputs=inp_text,
            outputs=output_text,
        )

    # External links: repository, feedback form, and paper.
    gr.Markdown("""
<div style='text-align: center;'>
<a href='https://github.com/dsfsi/nso-en-m2m100-gov' target='_blank'>GitHub</a> |
<a href='https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform' target='_blank'>Feedback Form</a> |
<a href='https://arxiv.org/abs/2303.03750' target='_blank'>Arxiv</a>
</div>
<br/>
""")

    # Collapsed-by-default section with model description, authors,
    # citation, and DOI.
    with gr.Accordion("More Information", open=False):
        gr.Markdown("""
<h4 style="text-align: center;">Model Description</h4>
<p style='text-align: center;'>This is a variant of the M2M100 model, fine-tuned on a multilingual dataset to support translation from Northern Sotho (Sepedi) to English. The model was trained with a focus on improving translation accuracy for low-resource languages.</p>
""")
        gr.Markdown("""
<h4 style="text-align: center;">Authors</h4>
<div style='text-align: center;'>
Vukosi Marivate, Matimba Shingange, Richard Lastrucci,
Isheanesu Joseph Dzingirai, Jenalea Rajab
</div>
""")
        gr.Markdown("""
<h4 style="text-align: center;">Citation</h4>
<pre style="text-align: center; white-space: pre-wrap;">
@inproceedings{lastrucci-etal-2023-preparing,
title = "Preparing the Vuk{'}uzenzele and {ZA}-gov-multilingual {S}outh {A}frican multilingual corpora",
author = "Richard Lastrucci and Isheanesu Dzingirai and Jenalea Rajab
and Andani Madodonga and Matimba Shingange and Daniel Njini and Vukosi Marivate",
booktitle = "Proceedings of the Fourth workshop on Resources for African Indigenous Languages (RAIL 2023)",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.rail-1.3",
pages = "18--25"
}
</pre>
""")
        gr.Markdown("""
<h4 style="text-align: center;">DOI</h4>
<div style='text-align: center;'>
<a href="https://doi.org/10.48550/arXiv.2303.03750" target="_blank">10.48550/arXiv.2303.03750</a>
</div>
""")

# Start the app as soon as this module is executed.
demo.launch()
|