Spaces:

barbaroo
/

Faroese_English_Ukranian_Translator

Runtime error

App Files Files Community

barbaroo committed on May 17

Commit

8b08dff

•

1 Parent(s): 27eb948

Update app.py

Browse files

Files changed (1) hide show

app.py +9 -20

app.py CHANGED Viewed

@@ -1,41 +1,31 @@
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
-import re
-# Checkpoint identifier handed to from_pretrained for both the tokenizer and the model
 model_checkpoint_path = "barbaroo/nllb_200_600M_fo_en"
-# Load the tokenizer and seq2seq model once at module level; gradio_translate() passes them to translate()
 tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_path)
 model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_path)
 def split_into_sentences(text):
-    # Split at every whitespace character that directly follows '.', '?' or '!', unless the
-    # preceding text matches \w\.\w. (e.g. "e.g.") or [A-Z][a-z]\. (e.g. "Mr."); what follows the whitespace is not checked.
-    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)
-    return sentences
 def translate(text, model, tokenizer, max_length=80):
-    # Break the input into sentences so each one is translated on its own
     sentences = split_into_sentences(text)
     translated_text = []
-    # Per sentence: encode (truncated to max_length), generate with 4 beams, decode the first returned sequence
     for sentence in sentences:
         inputs = tokenizer.encode(sentence, return_tensors="pt", max_length=max_length, truncation=True)
         outputs = model.generate(inputs, max_length=max_length, num_beams=4, early_stopping=True)
         translated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
         translated_text.append(translated_sentence)
-    # Join the translated sentences with single spaces into one string
     return " ".join(translated_text)
-# Gradio Interface setup
-# Ensure Gradio is installed
-# Importing Gradio
-import gradio as gr
-# Define the Gradio interface
 def gradio_translate(text):
     """Translate ``text`` using the module-level ``model`` and ``tokenizer``.

     This is the function handed to ``gr.Interface`` as ``fn``.
     """
     translation = translate(text, model, tokenizer)
     return translation
@@ -45,5 +35,4 @@ iface = gr.Interface(fn=gradio_translate,
                      title="Faroese to English Translator",
                      description="Translate Faroese text to English using a state-of-the-art model.")
-# Launch the interface
 iface.launch()

+import nltk
+from nltk.tokenize import sent_tokenize
+nltk.download('punkt')  # Required to use the sentence tokenizer
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+import gradio as gr
+# Load the tokenizer and seq2seq model once at module level from the checkpoint named below; gradio_translate() passes them to translate()
 model_checkpoint_path = "barbaroo/nllb_200_600M_fo_en"
 tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_path)
 model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_path)
 def split_into_sentences(text):
+    # Sentence splitting is delegated to NLTK's sent_tokenize (imported at the top of the file); its result is returned unchanged.
+    return sent_tokenize(text)  # NOTE(review): newer NLTK releases may look up 'punkt_tab' rather than 'punkt' - confirm the nltk.download() call matches the installed version
 def translate(text, model, tokenizer, max_length=80):
     """Translate ``text`` sentence by sentence and return one joined string.

     The input is split with ``split_into_sentences``. Each sentence is
     encoded (truncated to ``max_length``), passed to ``model.generate`` with
     ``max_length``, 4 beams and early stopping, and the first returned
     sequence is decoded with special tokens skipped. The per-sentence
     results are joined with single spaces.
     """
     def _translate_one(sentence):
         # Encode a single sentence; anything beyond max_length is truncated.
         token_ids = tokenizer.encode(
             sentence,
             return_tensors="pt",
             max_length=max_length,
             truncation=True,
         )
         generated = model.generate(
             token_ids,
             max_length=max_length,
             num_beams=4,
             early_stopping=True,
         )
         # Only the first returned sequence is used.
         return tokenizer.decode(generated[0], skip_special_tokens=True)
     pieces = [_translate_one(sentence) for sentence in split_into_sentences(text)]
     return " ".join(pieces)
 def gradio_translate(text):
     """Translate ``text`` using the module-level ``model`` and ``tokenizer``.

     This is the function handed to ``gr.Interface`` as ``fn``.
     """
     translation = translate(text, model, tokenizer)
     return translation
                      title="Faroese to English Translator",
                      description="Translate Faroese text to English using a state-of-the-art model.")
 iface.launch()