Spaces:

anzorq
/

zedzek

Running on CPU Upgrade

File size: 3,550 Bytes

bd9805e
 
dfd6c72
a4d0b27
dfd6c72
 
 
a4d0b27
dfd6c72
 
 
a4d0b27
1884e2f
dfd6c72
 
 
 
 
 
 
 
 
 
 
 
f751f4e
112c6bb
b272afc
f751f4e
dfd6c72
 
ac0fe1d
f751f4e
dfd6c72
 
 
 
 
 
a4d0b27
bd9805e
3e2e722
bd9805e

import gradio as gr
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import fasttext

# Initialize the fastText model from the local file 'lid.323.ftz'.
# translate() calls its predict() to label the language of the input text.
model_path = 'lid.323.ftz'
language_model = fasttext.load_model(model_path)

# Load the tokenizer and the seq2seq translation model from the same identifier
# (presumably a Hugging Face Hub repo id -- confirm it is not a local directory).
# use_safetensors=True requests the safetensors weight format when loading.
model_path_translation = "anzorq/m2m100_418M_ft_ru-kbd_44K"
tokenizer = AutoTokenizer.from_pretrained(model_path_translation)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path_translation, use_safetensors=True)

def translate(text, num_beams=4, num_return_sequences=4, tgt_lang="zu"):
    """Detect the language of *text* and translate it with beam search.

    Args:
        text: Source sentence(s) to translate.
        num_beams: Beam width passed to ``model.generate``.
        num_return_sequences: Number of candidate translations to return;
            clamped so it never exceeds ``num_beams``.
        tgt_lang: Key into ``tokenizer.lang_code_to_id`` used as the forced
            first generated token (the target-language tag).
            NOTE(review): the default "zu" is assumed to be the placeholder
            code the fine-tuned checkpoint uses for Kabardian -- confirm
            against the model card before relying on it.

    Returns:
        A tuple ``(detected_language, text, translations)`` where
        ``detected_language`` is the fastText label with its ``__label__``
        prefix removed (empty string if no label was predicted), ``text`` is
        the unmodified input, and ``translations`` is a list of decoded
        candidate strings.

    Raises:
        KeyError: If ``tgt_lang`` is not a key of ``tokenizer.lang_code_to_id``.
    """
    # fastText's predict() handles one line at a time and rejects embedded
    # newlines, so flatten them for detection only; the translation input
    # below keeps the original text.
    languages, _ = language_model.predict(text.replace("\n", " "), k=1)
    detected_language = languages[0].replace("__label__", "") if languages else ""

    # UI sliders may deliver floats; generate() needs integer beam counts.
    num_beams = int(num_beams)
    # generate() cannot return more sequences than there are beams.
    num_return_sequences = min(int(num_return_sequences), num_beams)

    inputs = tokenizer(text, return_tensors="pt")
    translated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
    )
    translations = [
        tokenizer.decode(translation, skip_special_tokens=True)
        for translation in translated_tokens
    ]

    return detected_language, text, translations

# Page heading and footer HTML shown by the Gradio interface.
title = "Russian-Circassian translator demo"
article = "<p style='text-align: center'>Want to help? Join the <a href='https://discord.gg/cXwv495r' target='_blank'>Discord server</a></p>"

# Beam-search controls exposed to the user. These use the top-level
# gr.Slider component (with `value=` for the initial setting) instead of the
# deprecated gr.inputs.Slider / `default=` API, matching the gr.Textbox
# component style used for the outputs.
num_beams = gr.Slider(minimum=2, maximum=10, step=1, label="Number of beams", value=4)
num_return_sequences = gr.Slider(minimum=2, maximum=10, step=1, label="Number of returned sentences", value=4)

# Output captions are set through each component's `label`; gr.Interface has
# no `titles` parameter, so the previous `titles=[...]` argument never
# labelled the outputs.
gr.Interface(
    fn=translate,
    inputs=["text", num_beams, num_return_sequences],
    outputs=[
        gr.Textbox(label="Detected Language"),
        gr.Textbox(label="Input"),
        gr.Textbox(label="Translations"),
    ],
    title=title,
    article=article,
).launch()

# import gradio as gr

# title = "Русско-черкесский переводчик"
# description = "Demo of a Russian-Circassian (Kabardian dialect) translator. <br>It is based on Facebook's <a href=\"https://about.fb.com/news/2020/10/first-multilingual-machine-translation-model/\">M2M-100</a> machine learning model, and has been trained on 45,000 Russian-Circassian sentence pairs. <br>It can also translate from 100 other languages to Circassian (English, French, Spanish, etc.), but less accurately. <br>The data corpus is constantly being expanded, and we need help in finding sentence sources, OCR, data cleaning, etc. <br>If you are interested in helping out with this project, please contact me at the link below.<br><br>This is only a demo, not a finished product. Translation quality is still low and will improve with time and more data.<br>45,000 sentence pairs is not enough to create an accurate machine translation model, and more data is needed.<br>You can help by finding sentence sources (books, web pages, etc.), scanning books, OCRing documents, data cleaning, and other tasks.<br><br>If you are interested in helping out with this project, contact me at the link below."
# article = """<p style='text-align: center'><a href='https://arxiv.org/abs/1806.00187'>Scaling Neural Machine Translation</a> | <a href='https://github.com/pytorch/fairseq/'>Github Repo</a></p>"""

# examples = [
#     ["Мы идем домой"],
#     ["Сегодня хорошая погода"],
#     ["Дети играют во дворе"],
#     ["We live in a big house"],
#     ["Tu es une bonne personne."],
#     ["أين تعيش؟"],
#     ["Bir şeyler yapmak istiyorum."],
# ]

# gr.Interface.load("models/anzorq/m2m100_418M_ft_ru-kbd_44K", title=title, description=description, article=article, examples=examples).launch()