import spaces
import gradio as gr
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import nltk
from nltk.tokenize import sent_tokenize
import torch

# Download the NLTK sentence tokenizer data ('punkt_tab' is also required on newer NLTK releases)
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the models and tokenizers
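# Faroese<->English: NLLB-200 600M fine-tunes; Ukrainian<->English: Helsinki-NLP OPUS-MT
# (inferred from the checkpoint names below)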
model_checkpoint_fo_en = "barbaroo/nllb_200_600M_fo_en"
model_checkpoint_en_fo = "barbaroo/nllb_200_600M_en_fo"
model_checkpoint_uk_en = "Helsinki-NLP/opus-mt-uk-en"
model_checkpoint_en_uk = "Helsinki-NLP/opus-mt-en-uk"

tokenizer_fo_en = AutoTokenizer.from_pretrained(model_checkpoint_fo_en)
model_fo_en = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_fo_en)

tokenizer_en_fo = AutoTokenizer.from_pretrained(model_checkpoint_en_fo)
model_en_fo = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_en_fo)

tokenizer_uk_en = AutoTokenizer.from_pretrained(model_checkpoint_uk_en)
model_uk_en = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_uk_en)

tokenizer_en_uk = AutoTokenizer.from_pretrained(model_checkpoint_en_uk)
model_en_uk = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_en_uk)

# Check if a GPU is available and move models to GPU if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

if torch.cuda.is_available():
    print("GPU is available. Initializing models on GPU.")
    model_fo_en.to(device)
    model_en_fo.to(device)
    model_uk_en.to(device)
    model_en_uk.to(device)
else:
    print("GPU is not available. Using CPU.")

def split_into_sentences(text):
    # NLTK's sentence tokenizer lets long inputs be translated sentence by sentence
    return sent_tokenize(text)

@spaces.GPU  # on ZeroGPU Spaces this requests a GPU for the duration of the call
def translate(text, model, tokenizer, max_length=80):
    # Ensure the model is on the correct device
    model.to(device)
    
    sentences = split_into_sentences(text)
    translated_text = []

    for sentence in sentences:
        # Move inputs to the correct device
        inputs = tokenizer.encode(sentence, return_tensors="pt", max_length=max_length, truncation=True).to(device)
        print(f"Input tensor device: {inputs.device}")  # Debug statement
        
        # Model inference on the GPU
        outputs = model.generate(inputs, max_length=max_length, num_beams=4, early_stopping=True)
        print(f"Output tensor device: {outputs.device}")  # Debug statement
        
        # Move outputs back to CPU for decoding
        translated_sentence = tokenizer.decode(outputs[0].cpu(), skip_special_tokens=True)
        translated_text.append(translated_sentence)

    return " ".join(translated_text)

def handle_input(text, file, direction):
    if file is not None:
        # gr.File with type="binary" passes the upload as bytes; decode it to text
        text = file.decode("utf-8")

    # Bail out early so the pivot directions don't run their first leg on empty input
    if not text:
        return "Please enter text or upload a text file."

    if direction == "fo_en":
        model = model_fo_en
        tokenizer = tokenizer_fo_en
    elif direction == "en_fo":
        model = model_en_fo
        tokenizer = tokenizer_en_fo
    elif direction == "uk_en":
        model = model_uk_en
        tokenizer = tokenizer_uk_en
    elif direction == "en_uk":
        model = model_en_uk
        tokenizer = tokenizer_en_uk
    elif direction == "uk_fo":
        # Ukrainian to Faroese via an English pivot: uk -> en, then en -> fo
        text = translate(text, model_uk_en, tokenizer_uk_en)
        model = model_en_fo
        tokenizer = tokenizer_en_fo
    elif direction == "fo_uk":
        # Faroese to Ukrainian via an English pivot: fo -> en, then en -> uk
        text = translate(text, model_fo_en, tokenizer_fo_en)
        model = model_en_uk
        tokenizer = tokenizer_en_uk
    else:
        # Guard against an unrecognized direction leaving model/tokenizer undefined
        return f"Unknown translation direction: {direction}"

    return translate(text, model, tokenizer)

# Define the Gradio interface
iface = gr.Interface(
    fn=handle_input, 
    inputs=[
        gr.Textbox(lines=2, placeholder="Type here or upload a text file..."), 
        gr.File(label="or Upload Text File", type="binary"),
        gr.Dropdown(label="Translation Direction", choices=["fo_en", "en_fo", "uk_en", "en_uk", "uk_fo", "fo_uk"], value="fo_en")
    ],
    outputs="text",
    title="Multilingual Translator",
    description="Enter text directly or upload a text file (.txt) to translate between Faroese, Ukrainian, and English."
)

# Launch the interface
iface.launch()