barbaroo committed on
Commit
8b08dff
1 Parent(s): 27eb948

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -20
app.py CHANGED
@@ -1,41 +1,31 @@
 
 
 
 
1
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
2
- import re
3
 
4
- # Path to your model's checkpoints
5
  model_checkpoint_path = "barbaroo/nllb_200_600M_fo_en"
6
-
7
- # Load the tokenizer and model
8
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_path)
9
  model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_path)
10
 
11
  def split_into_sentences(text):
12
- # This simple function splits text into sentences using regular expressions
13
- # that capture punctuation marks followed by space and a capital letter.
14
- sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)
15
- return sentences
16
 
17
  def translate(text, model, tokenizer, max_length=80):
18
- # Split long text into sentences
19
  sentences = split_into_sentences(text)
20
  translated_text = []
21
-
22
- # Process each sentence separately
23
  for sentence in sentences:
24
  inputs = tokenizer.encode(sentence, return_tensors="pt", max_length=max_length, truncation=True)
25
  outputs = model.generate(inputs, max_length=max_length, num_beams=4, early_stopping=True)
26
  translated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
27
  translated_text.append(translated_sentence)
28
 
29
- # Join the translated sentences back into a single string
30
  return " ".join(translated_text)
31
 
32
- # Gradio Interface setup
33
- # Ensure Gradio is installed
34
-
35
- # Importing Gradio
36
- import gradio as gr
37
-
38
- # Define the Gradio interface
39
  def gradio_translate(text):
40
  return translate(text, model, tokenizer)
41
 
@@ -45,5 +35,4 @@ iface = gr.Interface(fn=gradio_translate,
45
  title="Faroese to English Translator",
46
  description="Translate Faroese text to English using a state-of-the-art model.")
47
 
48
- # Launch the interface
49
  iface.launch()
 
1
+ import nltk
2
+ from nltk.tokenize import sent_tokenize
3
+ nltk.download('punkt') # Required to use the sentence tokenizer
4
+
5
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
6
+ import gradio as gr
7
 
8
+ # Load the model and tokenizer
9
  model_checkpoint_path = "barbaroo/nllb_200_600M_fo_en"
 
 
10
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_path)
11
  model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_path)
12
 
13
  def split_into_sentences(text):
14
+ # Using NLTK's sent_tokenize to handle various sentence terminations more effectively
15
+ return sent_tokenize(text)
 
 
16
 
17
  def translate(text, model, tokenizer, max_length=80):
 
18
  sentences = split_into_sentences(text)
19
  translated_text = []
20
+
 
21
  for sentence in sentences:
22
  inputs = tokenizer.encode(sentence, return_tensors="pt", max_length=max_length, truncation=True)
23
  outputs = model.generate(inputs, max_length=max_length, num_beams=4, early_stopping=True)
24
  translated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
25
  translated_text.append(translated_sentence)
26
 
 
27
  return " ".join(translated_text)
28
 
 
 
 
 
 
 
 
29
  def gradio_translate(text):
30
  return translate(text, model, tokenizer)
31
 
 
35
  title="Faroese to English Translator",
36
  description="Translate Faroese text to English using a state-of-the-art model.")
37
 
 
38
  iface.launch()