import streamlit as st
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer

translation_model_name = 'sarahai/nllb-uzbek-cyrillic-to-russian'


@st.cache_resource
def load_translation_model(model_name):
    """Load the NLLB model and tokenizer once and cache them across Streamlit reruns."""
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = NllbTokenizer.from_pretrained(model_name)
    return model, tokenizer


translation_model, translation_tokenizer = load_translation_model(translation_model_name)


def split_into_chunks(text, tokenizer, max_length=150):
    """Split text into chunks of at most max_length tokens so long documents can be translated piecewise."""
    tokens = tokenizer.tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0
    for token in tokens:
        current_chunk.append(token)
        current_length += 1
        if current_length >= max_length:
            chunks.append(tokenizer.convert_tokens_to_string(current_chunk))
            current_chunk = []
            current_length = 0
    if current_chunk:
        chunks.append(tokenizer.convert_tokens_to_string(current_chunk))
    return chunks


def translate(text, model, tokenizer, src_lang='uzb_Cyrl', tgt_lang='rus_Cyrl'):
    """Translate text chunk by chunk from src_lang to tgt_lang."""
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang
    # Chunk at the same length the tokenizer truncates to, so no text is silently dropped.
    chunks = split_into_chunks(text, tokenizer, max_length=128)
    translated_chunks = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors='pt', padding=True, truncation=True, max_length=128)
        # Force the decoder to start with the target-language token so NLLB generates Russian.
        outputs = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
        )
        translated_chunks.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    return ' '.join(translated_chunks)


# Placeholder for custom page CSS/HTML (currently empty).
st.markdown(""" """, unsafe_allow_html=True)

st.sidebar.markdown('## Навигация')
uploaded_file = st.sidebar.file_uploader("Загрузите текстовый файл...", type=["txt"])
process_btn = False
if uploaded_file:
    st.sidebar.text("Файл загружен")
    process_btn = st.sidebar.button("Перевести")

# Page title and subtitle; the heading tags are minimal placeholders for the original markup.
st.markdown('<h1>Перевод текста</h1>', unsafe_allow_html=True)
st.markdown('<h3>Перевод с узбекского на русский</h3>', unsafe_allow_html=True)

if process_btn and uploaded_file:
    uploaded_text = uploaded_file.read().decode('utf-8')
    st.text_area("Исходный текст", uploaded_text, height=150)
    with st.spinner('Переводим...'):
        translated_text = translate(uploaded_text, translation_model, translation_tokenizer)
    st.text_area("Переведенный текст (на русском):", value=translated_text, height=200)
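
# To run the app locally (assuming this file is saved as app.py and the required
# packages are installed: streamlit, transformers, torch, sentencepiece):
#
#   streamlit run app.py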