"""Gradio chat app for the LLight-3.2-3B-Instruct GGUF model via llama-cpp-python."""

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Lazily-initialized Llama instance shared by all requests.
model = None


def load_model():
    """Download the GGUF weights from the Hub and initialize the Llama model.

    Returns the initialized ``Llama`` instance and stores it in the module-level
    ``model`` global. Re-raises any download/initialization error after logging.
    """
    global model
    try:
        print("Начинаем загрузку модели из Hub...")
        model_path = hf_hub_download(
            repo_id="AugustLight/LLight-3.2-3B-Instruct",
            filename="Llight.Q8_0.gguf",
            repo_type="model",
        )
        print(f"Модель загружена в: {model_path}")
        model = Llama(
            model_path=model_path,
            n_ctx=2048,
            n_threads=4,
            n_batch=512,
        )
        print("Модель успешно инициализирована!")
        return model
    except Exception as e:
        print(f"Подробная ошибка при загрузке модели: {str(e)}")
        # Bare `raise` preserves the original traceback (unlike `raise e`).
        raise


def respond(message, history, system_message, max_new_tokens, temperature, top_p):
    """Stream a completion for *message* given the chat *history*.

    Builds a plain-text "User:/Assistant:" prompt from the system message and
    prior turns, then yields the accumulated response text chunk by chunk.
    On failure, yields a single error string instead of raising.
    """
    try:
        global model
        if model is None:
            model = load_model()

        # Flatten system message + history into the model's expected prompt format.
        context = f"{system_message}\n\n"
        for user_msg, assistant_msg in history:
            context += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
        context += f"User: {message}\nAssistant: "

        print(f"Генерируем ответ для контекста длиной {len(context)} символов")

        response_text = ""
        # Streamed generation: each iteration yields the text accumulated so far,
        # so the UI can update incrementally.
        for response in model(
            prompt=context,
            max_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            stop=["User:", "\n\n", "<|endoftext|>"],
            echo=False,
            stream=True,
        ):
            chunk = response['choices'][0]['text']
            response_text += chunk
            print(f"Промежуточный ответ: {chunk}")
            yield response_text  # emit the accumulated text

        print("Ответ сгенерирован полностью.")

    except Exception as e:
        error_msg = f"Произошла ошибка: {str(e)}"
        print(error_msg)
        yield error_msg


with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Сообщение")

    with gr.Accordion("Параметры", open=False):
        # NOTE: the original source had a raw newline inside this plain string
        # literal (a SyntaxError); the two sentences are joined with a space.
        system = gr.Textbox(
            value="Ты дружелюбный и полезный ассистент. Отвечай обдуманно и по делу.",
            label="System message",
        )
        max_new_tokens = gr.Slider(
            minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"
        )
        temperature = gr.Slider(
            minimum=0.1, maximum=2.0, value=0.3, step=0.1, label="Temperature"
        )
        top_p = gr.Slider(
            minimum=0.1, maximum=1.0, value=0.95, step=0.05,
            label="Top-p (nucleus sampling)",
        )

    clear = gr.Button("Очистить")

    def user(user_message, history):
        """Append the user's turn (with a pending assistant slot) and clear the box."""
        return "", history + [[user_message, None]]

    def bot(history, system_message, max_new_tokens, temperature, top_p):
        """Fill the pending assistant slot by streaming tokens from `respond`."""
        message = history[-1][0]
        for response_text in respond(
            message, history[:-1], system_message, max_new_tokens, temperature, top_p
        ):
            history[-1][1] = response_text
            yield history

    # Submit: record the user turn immediately, then stream the bot reply.
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, [chatbot, system, max_new_tokens, temperature, top_p], chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)


if __name__ == "__main__":
    try:
        print("Инициализация приложения...")
        model = load_model()
        print("Модель загружена успешно при старте")
    except Exception as e:
        # Best-effort warm start: the app still launches and loads lazily on
        # first request if startup loading fails.
        print(f"Ошибка при инициализации: {str(e)}")

    demo.launch()