File size: 4,054 Bytes
3d4f13a
b5f7961
3d4f13a
4f63972
ef9b88b
3b68341
ef9b88b
b02baad
3b68341
 
4f70f9f
b5f7961
 
 
 
3d4f13a
3363f16
8076cbb
 
 
 
 
 
 
 
 
 
 
263fe73
0e7ef21
 
 
 
 
263fe73
 
 
 
 
 
 
 
 
 
692156b
 
0e7ef21
 
263fe73
 
0e7ef21
263fe73
0e7ef21
263fe73
 
 
 
0e7ef21
 
263fe73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e7ef21
263fe73
0e7ef21
263fe73
692156b
3363f16
dbeaa8d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import gradio as gr
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Hub checkpoint identifier shared by the tokenizer and the model weights.
model_name = "dsfsi/nso-en-m2m100-gov"

# Load the tokenizer first, then the sequence-to-sequence model, both from the
# same checkpoint.
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

# Inputs are encoded with the "ns" source-language code, and the model config
# is told to start every generation with the "en" language token.
tokenizer.src_lang = "ns"
_english_lang_id = tokenizer.get_lang_id("en")
model.config.forced_bos_token_id = _english_lang_id

def translate(inp):
    """Translate the given text to English with the module-level M2M100 model.

    The source language is whatever ``tokenizer.src_lang`` is configured to
    (set to ``"ns"`` elsewhere in this file); the target language is forced to
    English on every call.

    Args:
        inp: Text to translate. ``None``, an empty value, or a string that
            contains only whitespace is treated as "nothing to translate".

    Returns:
        The decoded translation with special tokens stripped, or ``""`` when
        there is nothing to translate.
    """
    # Skip the model entirely for blank input: generating from an empty
    # sentence only produces spurious text in the output box.
    if not inp or (isinstance(inp, str) and not inp.strip()):
        return ""

    # truncation=True bounds the encoded input by the tokenizer's configured
    # maximum length instead of feeding arbitrarily long text to the model.
    inputs = tokenizer(inp, return_tensors="pt", truncation=True)
    translated_tokens = model.generate(
        **inputs,
        max_length=512,
        # Make the English language token the first generated token.
        forced_bos_token_id=tokenizer.get_lang_id("en"),
    )
    # Only the first sequence of the batch is decoded and returned.
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

# Build the Gradio Blocks page: a header row (logo + description), the
# translation panel, a row of external links, and a collapsed accordion with
# model details. Components appear in the order they are created below.
with gr.Blocks() as demo:
    # Header row with three columns in a 1:4:1 ratio. The two outer columns
    # hold no components (presumably spacers that centre the middle column).
    with gr.Row():
        with gr.Column(scale=1):  
            pass
        with gr.Column(scale=4, min_width=1000): 
            # Logo is loaded from a file path relative to the working directory.
            gr.Image("logo_transparent_small.png", elem_id="logo", show_label=False, width=500)
            gr.Markdown("""
            <h1 style='text-align: center;'>Northern Sotho to English Translation</h1>
            <p style='text-align: center;'>This space provides a translation service from Northern Sotho to English using the M2M100 model, fine-tuned for low-resource languages. It supports researchers, linguists, and users working with Northern Sotho texts.</p>
            """)
        with gr.Column(scale=1):  
            pass
    
    # Translation panel: input box, output box, and the button that connects them.
    with gr.Column(variant="panel"):
        # lines=5 sets the initial height of the textbox; nothing in this block
        # enforces the "maximum 5 lines" mentioned in the placeholder.
        inp_text = gr.Textbox(lines=5, placeholder="Enter Northern Sotho text (maximum 5 lines)", label="Input", elem_id="centered-input")
        output_text = gr.Textbox(label="Output", elem_id="centered-output")
        translate_button = gr.Button("Translate", elem_id="centered-button")
        # On click, call translate() with the input text and show its return
        # value in the output box.
        translate_button.click(translate, inputs=inp_text, outputs=output_text)
    
    # External links: source repository, feedback form, and the paper.
    gr.Markdown("""
    <div style='text-align: center;'>
        <a href='https://github.com/dsfsi/nso-en-m2m100-gov' target='_blank'>GitHub</a> |
        <a href='https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform' target='_blank'>Feedback Form</a> |
        <a href='https://arxiv.org/abs/2303.03750' target='_blank'>Arxiv</a>
    </div>
    <br/>
    """)
    
    # Collapsed-by-default section with the model description, authors,
    # BibTeX citation, and DOI.
    with gr.Accordion("More Information", open=False):
        gr.Markdown("""
        <h4 style="text-align: center;">Model Description</h4>
        <p style='text-align: center;'>This is a variant of the M2M100 model, fine-tuned on a multilingual dataset to support translation from Northern Sotho (Sepedi) to English. The model was trained with a focus on improving translation accuracy for low-resource languages.</p>
        """)
        gr.Markdown("""
        <h4 style="text-align: center;">Authors</h4>
        <div style='text-align: center;'>
            Vukosi Marivate, Matimba Shingange, Richard Lastrucci, 
            Isheanesu Joseph Dzingirai, Jenalea Rajab
        </div>
        """)
        # The <pre> element with white-space: pre-wrap keeps the BibTeX line
        # breaks while still allowing long lines to wrap.
        gr.Markdown("""
        <h4 style="text-align: center;">Citation</h4>
        <pre style="text-align: center; white-space: pre-wrap;">
        @inproceedings{lastrucci-etal-2023-preparing,
            title = "Preparing the Vuk{'}uzenzele and {ZA}-gov-multilingual {S}outh {A}frican multilingual corpora",
            author = "Richard Lastrucci and Isheanesu Dzingirai and Jenalea Rajab 
                      and Andani Madodonga and Matimba Shingange and Daniel Njini and Vukosi Marivate",
            booktitle = "Proceedings of the Fourth workshop on Resources for African Indigenous Languages (RAIL 2023)",
            month = may,
            year = "2023",
            address = "Dubrovnik, Croatia",
            publisher = "Association for Computational Linguistics",
            url = "https://aclanthology.org/2023.rail-1.3",
            pages = "18--25"
        }
        </pre>
        """)
        gr.Markdown("""
        <h4 style="text-align: center;">DOI</h4>
        <div style='text-align: center;'>
            <a href="https://doi.org/10.48550/arXiv.2303.03750" target="_blank">10.48550/arXiv.2303.03750</a>
        </div>
        """)

# Start the Gradio app; this runs whenever the module is executed or imported.
demo.launch()