import gradio as gr from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer model_name = "dsfsi/nso-en-m2m100-gov" tokenizer = M2M100Tokenizer.from_pretrained(model_name) model = M2M100ForConditionalGeneration.from_pretrained(model_name) tokenizer.src_lang = "ns" model.config.forced_bos_token_id = tokenizer.get_lang_id("en") def translate(inp): inputs = tokenizer(inp, return_tensors="pt") translated_tokens = model.generate(**inputs, max_length=512, forced_bos_token_id=tokenizer.get_lang_id("en")) translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True) return translated_text with gr.Blocks() as demo: with gr.Row(): with gr.Column(scale=1): pass with gr.Column(scale=4, min_width=1000): gr.Image("logo_transparent_small.png", elem_id="logo", show_label=False, width=500) gr.Markdown("""
This space provides a translation service from Northern Sotho to English using the M2M100 model, fine-tuned for low-resource languages. It supports researchers, linguists, and users working with Northern Sotho texts.
""") with gr.Column(scale=1): pass with gr.Column(variant="panel"): inp_text = gr.Textbox(lines=5, placeholder="Enter Northern Sotho text (maximum 5 lines)", label="Input", elem_id="centered-input") output_text = gr.Textbox(label="Output", elem_id="centered-output") translate_button = gr.Button("Translate", elem_id="centered-button") translate_button.click(translate, inputs=inp_text, outputs=output_text) gr.Markdown("""This is a variant of the M2M100 model, fine-tuned on a multilingual dataset to support translation from Northern Sotho (Sepedi) to English. The model was trained with a focus on improving translation accuracy for low-resource languages.
""") gr.Markdown("""@inproceedings{lastrucci-etal-2023-preparing, title = "Preparing the Vuk{'}uzenzele and {ZA}-gov-multilingual {S}outh {A}frican multilingual corpora", author = "Richard Lastrucci and Isheanesu Dzingirai and Jenalea Rajab and Andani Madodonga and Matimba Shingange and Daniel Njini and Vukosi Marivate", booktitle = "Proceedings of the Fourth workshop on Resources for African Indigenous Languages (RAIL 2023)", month = may, year = "2023", address = "Dubrovnik, Croatia", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2023.rail-1.3", pages = "18--25" }""") gr.Markdown("""