sarahai's picture
Update app.py
4dace66 verified
import streamlit as st
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
translation_model_name = 'sarahai/nllb-uzbek-cyrillic-to-russian'


@st.cache_resource
def _load_translation_resources(model_name):
    """Load the seq2seq model and its NLLB tokenizer for *model_name*.

    Streamlit re-executes the script on every widget interaction;
    ``st.cache_resource`` keeps a single loaded copy so the weights are not
    read again on each rerun.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = NllbTokenizer.from_pretrained(model_name)
    return model, tokenizer


# Module-level handles consumed by the UI code further down.
translation_model, translation_tokenizer = _load_translation_resources(translation_model_name)
def split_into_chunks(text, tokenizer, max_length=150):
    """Cut *text* into consecutive pieces of at most *max_length* tokens.

    The text is tokenized with ``tokenizer.tokenize``, the token list is
    sliced into fixed-size windows, and each window is turned back into a
    string with ``tokenizer.convert_tokens_to_string``.

    Returns:
        The chunk strings in their original order; an empty list when the
        tokenizer produces no tokens.
    """
    pieces = tokenizer.tokenize(text)
    # A limit below one still yields one token per chunk.
    window = max(1, max_length)
    return [
        tokenizer.convert_tokens_to_string(pieces[start:start + window])
        for start in range(0, len(pieces), window)
    ]
def translate(text, model, tokenizer, src_lang='uzb_Cyrl', tgt_lang='rus_Cyrl', *, max_length=128):
    """Translate *text* chunk by chunk and return the joined translation.

    Args:
        text: Source text to translate.
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer paired with *model*; its ``src_lang`` and
            ``tgt_lang`` attributes are overwritten by this call.
        src_lang: Language code assigned to ``tokenizer.src_lang``.
        tgt_lang: Language code assigned to ``tokenizer.tgt_lang`` and used
            as the forced first generated token.
        max_length: Upper bound on encoded input tokens per chunk, special
            tokens included.

    Returns:
        The per-chunk translations joined with single spaces; an empty
        string when *text* yields no chunks.
    """
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang

    # Chunk below the encoding limit (minus the special tokens the tokenizer
    # adds) so truncation does not silently drop the tail of every chunk.
    chunk_size = max(1, max_length - tokenizer.num_special_tokens_to_add())
    # NLLB picks the output language from the first generated token.
    target_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang)

    translated_chunks = []
    for chunk in split_into_chunks(text, tokenizer, max_length=chunk_size):
        # truncation stays on as a safety net: re-tokenizing a chunk string
        # may not reproduce exactly the same token count.
        inputs = tokenizer(chunk, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
        # Pass the whole encoding so the attention mask reaches generate().
        outputs = model.generate(**inputs, forced_bos_token_id=target_lang_id)
        translated_chunks.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    return ' '.join(translated_chunks)
# Stylesheet injected into the page; .big-font is used by the headings
# rendered further down.
_PAGE_STYLE = """
<style>
.big-font {
font-size:30px !important;
font-weight: bold;
}
.small-font {
font-size:18px !important;
}
</style>
"""
st.markdown(_PAGE_STYLE, unsafe_allow_html=True)
# --- Sidebar: file upload and the translate trigger ---
st.sidebar.markdown('## Навигация')
uploaded_file = st.sidebar.file_uploader("Загрузите текстовый файл...", type=["txt"])
process_btn = False
if uploaded_file:
    st.sidebar.text("Файл загружен")
    # The button is only offered once a file is present.
    process_btn = st.sidebar.button("Перевести")

# --- Main area: headings ---
st.markdown('<h1 class="big-font">Перевод текста</h1>', unsafe_allow_html=True)
st.markdown('<div class="big-font">Перевод с узбекского на русский</div>', unsafe_allow_html=True)

# --- Main area: run the translation when requested ---
if process_btn and uploaded_file:
    try:
        # utf-8-sig also accepts plain UTF-8 and strips a leading BOM, which
        # would otherwise be sent to the translator as part of the text.
        uploaded_text = uploaded_file.read().decode('utf-8-sig')
    except UnicodeDecodeError:
        # Report an undecodable upload instead of crashing the script run.
        st.error("Не удалось прочитать файл: ожидается текст в кодировке UTF-8.")
    else:
        st.text_area("Исходный текст", uploaded_text, height=150)
        with st.spinner('Переводим...'):
            translated_text = translate(uploaded_text, translation_model, translation_tokenizer)
        st.text_area("Переведенный текст (на русском):", value=translated_text, height=200)