# Spaces: Sleeping
import streamlit as st | |
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer | |
translation_model_name = 'sarahai/nllb-uzbek-cyrillic-to-russian'


@st.cache_resource
def _load_translation_resources(model_name):
    """Load the seq2seq model and its NLLB tokenizer for *model_name*.

    Streamlit re-executes the whole script on every widget interaction, so
    calling ``from_pretrained`` directly at module level would reload the
    weights on each rerun. ``st.cache_resource`` keeps a single shared copy
    per process instead.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = NllbTokenizer.from_pretrained(model_name)
    return model, tokenizer


# NOTE: the cached objects are shared between reruns and sessions.
translation_model, translation_tokenizer = _load_translation_resources(translation_model_name)
def split_into_chunks(text, tokenizer, max_length=150):
    """Cut *text* into pieces of at most *max_length* tokenizer tokens.

    The text is tokenized with ``tokenizer.tokenize`` and the tokens are
    grouped in order; each group is turned back into a string with
    ``tokenizer.convert_tokens_to_string``. Groups are closed purely by token
    count, so a boundary may fall in the middle of a sentence or word.

    Returns:
        A list of chunk strings (empty when the text yields no tokens).
    """
    pieces = []
    pending = []
    for token in tokenizer.tokenize(text):
        pending.append(token)
        if len(pending) < max_length:
            continue
        # The group is full: flush it and start collecting the next one.
        pieces.append(tokenizer.convert_tokens_to_string(pending))
        pending = []
    if pending:
        # Trailing, partially filled group.
        pieces.append(tokenizer.convert_tokens_to_string(pending))
    return pieces
def translate(text, model, tokenizer, src_lang='uzb_Cyrl', tgt_lang='rus_Cyrl',
              max_input_length=128, chunk_length=100):
    """Translate *text* chunk by chunk and join the translations with spaces.

    Args:
        text: Source text to translate.
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer matching *model*; its ``src_lang`` and
            ``tgt_lang`` attributes are overwritten by this call.
        src_lang: Language code of the input text.
        tgt_lang: Language code of the desired output.
        max_input_length: Hard cap (in tokens, special tokens included)
            applied when a chunk is encoded for the model.
        chunk_length: Number of tokens per chunk handed to
            ``split_into_chunks``. It is kept below *max_input_length* so
            that, after special tokens are added and the chunk string is
            re-tokenized, the truncation cap does not cut off chunk content.

    Returns:
        The translated chunks joined by single spaces; an empty string when
        the text produces no chunks.
    """
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang
    # NLLB selects the output language through the first generated token,
    # so the target language code has to be forced explicitly.
    target_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang)

    translated_chunks = []
    for chunk in split_into_chunks(text, tokenizer, max_length=chunk_length):
        # Truncation stays on as a safety net only; chunk_length is what
        # normally keeps the input within max_input_length.
        inputs = tokenizer(chunk, return_tensors='pt', truncation=True,
                           max_length=max_input_length)
        # Unpack the full encoding so the attention mask is passed as well.
        outputs = model.generate(**inputs, forced_bos_token_id=target_lang_id)
        translated_chunks.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    return ' '.join(translated_chunks)
# Stylesheet for the page: a large bold class and a smaller text class.
_PAGE_CSS = """
<style>
.big-font {
font-size:30px !important;
font-weight: bold;
}
.small-font {
font-size:18px !important;
}
</style>
"""
# unsafe_allow_html is required so the <style> tag is emitted as HTML.
st.markdown(_PAGE_CSS, unsafe_allow_html=True)
# Sidebar: navigation title, .txt uploader and, once a file is present,
# a confirmation line plus the button that starts the translation.
_sidebar = st.sidebar
_sidebar.markdown('## Навигация')
uploaded_file = _sidebar.file_uploader("Загрузите текстовый файл...", type=["txt"])
if uploaded_file:
    _sidebar.text("Файл загружен")
    process_btn = _sidebar.button("Перевести")
else:
    # No file uploaded, so the button is not rendered and cannot be pressed.
    process_btn = False
# Main-area title followed by the translation-direction subtitle, both
# rendered with the "big-font" CSS class.
for _heading_html in (
    '<h1 class="big-font">Перевод текста</h1>',
    '<div class="big-font">Перевод с узбекского на русский</div>',
):
    st.markdown(_heading_html, unsafe_allow_html=True)
# Runs only on the rerun triggered by the button press, with a file present.
if process_btn and uploaded_file:
    try:
        # getvalue() returns the whole upload regardless of the current
        # stream position; 'utf-8-sig' also strips a leading BOM if present.
        uploaded_text = uploaded_file.getvalue().decode('utf-8-sig')
    except UnicodeDecodeError:
        # Report a non-UTF-8 upload to the user instead of crashing the app.
        uploaded_text = None
        st.error("Не удалось прочитать файл: ожидается текст в кодировке UTF-8.")

    if uploaded_text is not None:
        st.text_area("Исходный текст", uploaded_text, height=150)
        with st.spinner('Переводим...'):
            translated_text = translate(uploaded_text, translation_model, translation_tokenizer)
        st.text_area("Переведенный текст (на русском):", value=translated_text, height=200)