Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,41 +1,31 @@
|
|
|
|
|
|
|
|
|
|
1 |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
2 |
-
import
|
3 |
|
4 |
-
#
|
5 |
model_checkpoint_path = "barbaroo/nllb_200_600M_fo_en"
|
6 |
-
|
7 |
-
# Load the tokenizer and model
|
8 |
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_path)
|
9 |
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_path)
|
10 |
|
11 |
def split_into_sentences(text):
    """Split *text* into sentences on terminal punctuation.

    Splits on any whitespace that follows '.', '?' or '!'; the negative
    lookbehinds try to avoid breaking inside dotted abbreviations
    (e.g. "e.g." or "A. Surname").
    """
    # NOTE(review): the first lookbehind `(?<!\w\.\w.)` ends with `.` (any
    # char) — presumably a typo for `\w`; kept byte-identical on purpose.
    sentence_boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s'
    return re.split(sentence_boundary, text)
|
16 |
|
17 |
def translate(text, model, tokenizer, max_length=80):
    """Translate *text* one sentence at a time and rejoin the results.

    The input is first split into sentences so long passages fit the
    model's token budget; each sentence is translated independently and
    the pieces are joined with single spaces.
    """
    pieces = []
    for sentence in split_into_sentences(text):
        # Encode, generate with 4-beam search, then decode the best hypothesis.
        encoded = tokenizer.encode(
            sentence, return_tensors="pt", max_length=max_length, truncation=True
        )
        generated = model.generate(
            encoded, max_length=max_length, num_beams=4, early_stopping=True
        )
        pieces.append(tokenizer.decode(generated[0], skip_special_tokens=True))
    return " ".join(pieces)
|
31 |
|
32 |
-
# Gradio Interface setup
|
33 |
-
# Ensure Gradio is installed
|
34 |
-
|
35 |
-
# Importing Gradio
|
36 |
-
import gradio as gr
|
37 |
-
|
38 |
-
# Define the Gradio interface
|
39 |
def gradio_translate(text):
    """Gradio callback: translate *text* using the module-level model and tokenizer."""
    result = translate(text, model, tokenizer)
    return result
|
41 |
|
@@ -45,5 +35,4 @@ iface = gr.Interface(fn=gradio_translate,
|
|
45 |
title="Faroese to English Translator",
|
46 |
description="Translate Faroese text to English using a state-of-the-art model.")
|
47 |
|
48 |
-
# Launch the interface
|
49 |
iface.launch()
|
|
|
1 |
+
import nltk
|
2 |
+
from nltk.tokenize import sent_tokenize
|
3 |
+
nltk.download('punkt') # Required to use the sentence tokenizer
|
4 |
+
|
5 |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
6 |
+
import gradio as gr
|
7 |
|
8 |
+
# Load the model and tokenizer.
# Fine-tuned Faroese→English NLLB-200 (600M) checkpoint from the HF Hub;
# first run downloads the weights, later runs read the local cache.
model_checkpoint_path = "barbaroo/nllb_200_600M_fo_en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_path)
|
12 |
|
13 |
def split_into_sentences(text):
    """Split *text* into sentences with NLTK's Punkt tokenizer.

    sent_tokenize handles abbreviations and varied sentence terminators
    more robustly than a hand-rolled regex.
    """
    sentences = sent_tokenize(text)
    return sentences
|
|
|
|
|
16 |
|
17 |
def translate(text, model, tokenizer, max_length=80):
    """Translate *text* one sentence at a time and rejoin with spaces.

    Args:
        text: Source-language text; may contain multiple sentences.
        model: Seq2seq model exposing ``generate`` (PyTorch — the
            tokenizer encodes with ``return_tensors="pt"``).
        tokenizer: Matching tokenizer exposing ``encode``/``decode``.
        max_length: Token budget applied both to the encoded input
            (with truncation) and to the generated output.

    Returns:
        The translated text: per-sentence translations joined by spaces.
    """
    import torch  # local import: file does not import torch at top level

    sentences = split_into_sentences(text)
    translated_text = []
    # Inference only: no_grad() skips autograd bookkeeping, reducing
    # memory use and speeding up generation.
    with torch.no_grad():
        for sentence in sentences:
            inputs = tokenizer.encode(
                sentence, return_tensors="pt",
                max_length=max_length, truncation=True,
            )
            outputs = model.generate(
                inputs, max_length=max_length,
                num_beams=4, early_stopping=True,
            )
            translated_sentence = tokenizer.decode(
                outputs[0], skip_special_tokens=True
            )
            translated_text.append(translated_sentence)
    return " ".join(translated_text)
|
28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
def gradio_translate(text):
    """Gradio callback: forward *text* to translate() with the module-level model/tokenizer."""
    return translate(text, model, tokenizer)
|
31 |
|
|
|
35 |
title="Faroese to English Translator",
|
36 |
description="Translate Faroese text to English using a state-of-the-art model.")
|
37 |
|
|
|
38 |
iface.launch()
|