from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import re

# Identifier of the pretrained checkpoint passed to `from_pretrained` below.
# NOTE(review): the value looks like a Hugging Face Hub repo id rather than a
# local filesystem path — confirm before pointing it at your own checkpoint.
model_checkpoint_path = "barbaroo/nllb_200_600M_fo_en"

# Load the tokenizer and the seq2seq model once, at module import time.
# Both are kept as module-level globals and handed to `translate` per request.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_path)

# Sentence boundary: a single whitespace character that directly follows
# '.', '?' or '!', except when the four characters before it look like a
# dotted abbreviation ("e.g.") or the three before it look like a capitalised
# two-letter abbreviation ("Mr.", "Dr."). The pattern does NOT look at what
# comes after the whitespace.
_SENTENCE_BOUNDARY_RE = re.compile(
    r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s'
)


def split_into_sentences(text: str) -> list:
    """Split *text* into sentences at whitespace that follows end punctuation.

    Args:
        text: The text to split.

    Returns:
        A list of sentence strings with surrounding whitespace removed.
        Blank fragments (produced by empty input, trailing whitespace, or
        several whitespace characters after a punctuation mark) are dropped,
        so empty or whitespace-only input yields ``[]``.
    """
    fragments = (piece.strip() for piece in _SENTENCE_BOUNDARY_RE.split(text))
    # Keep only fragments that still contain text after stripping.
    return [fragment for fragment in fragments if fragment]

def translate(text, model, tokenizer, max_length=80):
    """Translate *text* sentence by sentence and return the joined result.

    The text is split with ``split_into_sentences``; every sentence is
    encoded, passed to ``model.generate`` and decoded on its own, and the
    translated sentences are joined with single spaces.

    Args:
        text: Source text. ``None``, empty or whitespace-only input returns
            ``""`` without calling the model.
        model: Object exposing ``generate(input_ids, ...)``.
        tokenizer: Object exposing ``encode`` and ``decode``.
        max_length: Token limit applied both to the encoded input (longer
            sentences are truncated) and to the generated output.

    Returns:
        The translated sentences joined by a single space.
    """
    # Nothing to translate: avoid generating output from an empty source.
    if not text or not text.strip():
        return ""

    translated_text = []

    # Process each sentence separately
    for sentence in split_into_sentences(text):
        # Blank fragments (e.g. after trailing whitespace) carry no content;
        # feeding them to the model would only add spurious output.
        if not sentence.strip():
            continue

        inputs = tokenizer.encode(sentence, return_tensors="pt", max_length=max_length, truncation=True)
        outputs = model.generate(inputs, max_length=max_length, num_beams=4, early_stopping=True)
        # Only the first returned sequence is decoded.
        translated_text.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

    # Join the translated sentences back into a single string
    return " ".join(translated_text)

# Gradio Interface setup
# Ensure Gradio is installed

# Importing Gradio
import gradio as gr

# Define the Gradio interface
def gradio_translate(text):
    """Translate *text* using the module-level ``model`` and ``tokenizer``.

    Single-argument adapter around ``translate`` so it can be used as the
    ``fn`` callback of the Gradio interface.
    """
    translation = translate(text, model, tokenizer)
    return translation

# Build the web UI: one text input and one text output, both wired to
# `gradio_translate`, with the given title and description.
iface = gr.Interface(fn=gradio_translate, 
                     inputs="text", 
                     outputs="text", 
                     title="Faroese to English Translator",
                     description="Translate Faroese text to English using a state-of-the-art model.")

# Launch the interface.
# NOTE(review): this call sits at module level, so it runs on import as well
# as when the file is executed as a script — consider an
# `if __name__ == "__main__":` guard if this module is ever imported elsewhere.
iface.launch()