# barbaroo's picture
# Update app.py
# a9a2d8a verified
# raw
# history blame
# No virus
# 1.81 kB
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import re
# Checkpoint identifier handed to both `from_pretrained` calls below.
model_checkpoint_path = "barbaroo/nllb_200_600M_fo_en"
# Load the tokenizer and the seq2seq model once, at import time; the Gradio
# callback defined later in this file passes these same objects to `translate`.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_path)
# A single whitespace character that directly follows ".", "?" or "!",
# unless the punctuation ends a pattern such as "e.g." or a capitalised
# two-letter abbreviation such as "Mr.". Compiled once and reused per call.
_SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s')


def split_into_sentences(text):
    """Split *text* into a list of sentences.

    A boundary is any whitespace character that immediately follows ".",
    "?" or "!", except after abbreviation-like patterns ("e.g.", "Mr.").
    The following character does not have to be a capital letter.

    Each piece is stripped of surrounding whitespace and blank pieces are
    dropped, so empty or whitespace-only input yields an empty list and
    trailing whitespace never produces an empty "sentence".

    Args:
        text: The text to split.

    Returns:
        A list of non-empty sentence strings in their original order.
    """
    stripped_pieces = (piece.strip() for piece in _SENTENCE_BOUNDARY.split(text))
    return [piece for piece in stripped_pieces if piece]
def translate(text, model, tokenizer, max_length=80):
    """Translate *text* one sentence at a time and return the joined result.

    The text is split with `split_into_sentences`; each sentence is encoded,
    passed to `model.generate` with 4-beam search, and decoded with special
    tokens removed. Blank or whitespace-only sentences are skipped so the
    model is never asked to generate from an empty input.

    Args:
        text: Source text to translate.
        model: Object exposing `generate(inputs, ...)`.
        tokenizer: Object exposing `encode(...)` and `decode(...)`.
        max_length: Token limit applied both when truncating each encoded
            sentence and as the cap on each generated output.

    Returns:
        The translated sentences joined by single spaces; an empty string
        when the input contains no non-blank sentence.
    """
    translated_text = []
    # Process each sentence separately.
    for sentence in split_into_sentences(text):
        if not sentence.strip():
            # Nothing to translate; avoids a pointless beam search and
            # spurious output for empty input.
            continue
        inputs = tokenizer.encode(sentence, return_tensors="pt", max_length=max_length, truncation=True)
        outputs = model.generate(inputs, max_length=max_length, num_beams=4, early_stopping=True)
        translated_text.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    # Join the translated sentences back into a single string.
    return " ".join(translated_text)
# Gradio Interface setup
# Ensure Gradio is installed
# Importing Gradio
import gradio as gr
# Define the Gradio interface
def gradio_translate(text):
    """Translate *text* with the module-level model and tokenizer.

    Single-argument adapter around `translate`, used as the Gradio callback.
    """
    translation = translate(text, model, tokenizer)
    return translation
# Text-in / text-out interface wired to the translation callback.
iface = gr.Interface(
    fn=gradio_translate,
    inputs="text",
    outputs="text",
    title="Faroese to English Translator",
    description="Translate Faroese text to English using a state-of-the-art model.",
)
# Launch the interface.
iface.launch()