"""Gradio demo: Northern Sotho -> English translation with a fine-tuned M2M100 model."""

import gradio as gr
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

model_name = "dsfsi/nso-en-m2m100-gov"

# Loaded once at import time so every request reuses the same model instance.
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

# Source language is Northern Sotho ("ns"); decoding is forced to start with
# the English language token so the model emits English.
tokenizer.src_lang = "ns"
_TARGET_LANG_ID = tokenizer.get_lang_id("en")
model.config.forced_bos_token_id = _TARGET_LANG_ID


def translate(inp: str) -> str:
    """Translate Northern Sotho text to English.

    Args:
        inp: Text entered by the user in the input textbox.

    Returns:
        The decoded English translation with special tokens removed, or an
        empty string when ``inp`` is empty or contains only whitespace.
    """
    # Nothing to translate: skip the model call instead of letting it
    # generate output from an empty prompt.
    if not inp or not inp.strip():
        return ""

    # truncation=True caps the encoded input at the tokenizer's configured
    # maximum length so oversized pastes cannot reach the model unbounded.
    inputs = tokenizer(inp, return_tensors="pt", truncation=True)
    translated_tokens = model.generate(
        **inputs,
        max_length=512,
        forced_bos_token_id=_TARGET_LANG_ID,
    )
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)


# NOTE(review): the nesting below is reconstructed from statement order
# (header row -> input/output panel -> links -> collapsible details);
# confirm against the deployed layout.
with gr.Blocks() as demo:
    # Header row: empty side columns centre the logo and introduction.
    with gr.Row():
        with gr.Column(scale=1):
            pass
        with gr.Column(scale=4, min_width=1000):
            gr.Image("logo_transparent_small.png", elem_id="logo", show_label=False, width=500)
            gr.Markdown("""

Northern Sotho to English Translation

This space provides a translation service from Northern Sotho to English using the M2M100 model, fine-tuned for low-resource languages. It supports researchers, linguists, and users working with Northern Sotho texts.

""")
        with gr.Column(scale=1):
            pass

    # Translation panel: input box, output box and the button that wires
    # them to translate().
    with gr.Column(variant="panel"):
        inp_text = gr.Textbox(
            lines=5,
            placeholder="Enter Northern Sotho text (maximum 5 lines)",
            label="Input",
            elem_id="centered-input",
        )
        output_text = gr.Textbox(label="Output", elem_id="centered-output")
        translate_button = gr.Button("Translate", elem_id="centered-button")
        translate_button.click(translate, inputs=inp_text, outputs=output_text)

    gr.Markdown("""
GitHub | Feedback Form | Arxiv

""")

    # Collapsed by default: model background, authors, citation and DOI.
    with gr.Accordion("More Information", open=False):
        gr.Markdown("""

Model Description

This is a variant of the M2M100 model, fine-tuned on a multilingual dataset to support translation from Northern Sotho (Sepedi) to English. The model was trained with a focus on improving translation accuracy for low-resource languages.

""")
        gr.Markdown("""

Authors

Vukosi Marivate, Matimba Shingange, Richard Lastrucci, Isheanesu Joseph Dzingirai, Jenalea Rajab
""")
        gr.Markdown("""

Citation

        @inproceedings{lastrucci-etal-2023-preparing,
            title = "Preparing the Vuk{'}uzenzele and {ZA}-gov-multilingual {S}outh {A}frican multilingual corpora",
            author = "Richard Lastrucci and Isheanesu Dzingirai and Jenalea Rajab 
                      and Andani Madodonga and Matimba Shingange and Daniel Njini and Vukosi Marivate",
            booktitle = "Proceedings of the Fourth workshop on Resources for African Indigenous Languages (RAIL 2023)",
            month = may,
            year = "2023",
            address = "Dubrovnik, Croatia",
            publisher = "Association for Computational Linguistics",
            url = "https://aclanthology.org/2023.rail-1.3",
            pages = "18--25"
        }
        
""")
        gr.Markdown("""

DOI

10.48550/arXiv.2303.03750
""")

demo.launch()