'''
import gradio as gr
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import nltk
from nltk.tokenize import sent_tokenize
import torch

# Initialize and download necessary NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')  # newer NLTK releases (3.9+) look for 'punkt_tab' in sent_tokenize

# Load the models and tokenizers
model_checkpoint_fo_en = "barbaroo/nllb_200_600M_fo_en"
model_checkpoint_en_fo = "barbaroo/nllb_200_600M_en_fo"

tokenizer_fo_en = AutoTokenizer.from_pretrained(model_checkpoint_fo_en)
model_fo_en = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_fo_en)

tokenizer_en_fo = AutoTokenizer.from_pretrained(model_checkpoint_en_fo)
model_en_fo = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_en_fo)

# Check if a GPU is available and move the models there;
# .to(device) is a no-op when the device is already "cpu"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model_fo_en.to(device)
model_en_fo.to(device)

def split_into_sentences(text):
    return sent_tokenize(text)

def translate(text, model, tokenizer, max_length=80):
    # Translate sentence by sentence; note that sentences longer than
    # max_length tokens are truncated before being translated.
    sentences = split_into_sentences(text)
    translated_text = []
    
    for sentence in sentences:
        inputs = tokenizer.encode(sentence, return_tensors="pt", max_length=max_length, truncation=True).to(device)
        outputs = model.generate(inputs, max_length=max_length, num_beams=4, early_stopping=True)
        translated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
        translated_text.append(translated_sentence)

    return " ".join(translated_text)

def handle_input(text, file, direction):
    if file is not None:
        # Decode the file bytes directly
        text = file.decode("utf-8")
    
    if direction == "fo_en":
        model = model_fo_en
        tokenizer = tokenizer_fo_en
    else:
        model = model_en_fo
        tokenizer = tokenizer_en_fo
    
    # Translate the text if it's not empty
    if text:
        return translate(text, model, tokenizer)
    else:
        return "Please enter text or upload a text file."

# Define the Gradio interface
iface = gr.Interface(
    fn=handle_input, 
    inputs=[
        gr.Textbox(lines=2, placeholder="Type here or upload a text file..."), 
        gr.File(label="or Upload Text File", type="binary"),
        gr.Dropdown(label="Translation Direction", choices=["fo_en", "en_fo"], value="fo_en")
    ],
    outputs="text",
    title="Bidirectional Translator",
    description="Enter text directly or upload a text file (.txt) to translate between Faroese and English."
)

# Launch the interface
iface.launch()

'''

import torch

print(f"Is CUDA available: {torch.cuda.is_available()}")
# True
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")