import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
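# A minimal sketch of the dependencies this script assumes (names only; exact
# version pins are a guess, not taken from the Space itself). On Hugging Face
# Spaces these would go in requirements.txt:
#
#   gradio
#   huggingface_hub
#   llama-cpp-python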
model = None

def load_model():
    global model
    try:
        print("Starting model download from the Hub...")
        model_path = hf_hub_download(
            repo_id="AugustLight/LLight-3.2-3B-Instruct",
            filename="Llight.Q8_0.gguf",
            repo_type="model"
        )
        print(f"Model downloaded to: {model_path}")
        model = Llama(
            model_path=model_path,
            n_ctx=2048,    # context window in tokens
            n_threads=4,   # CPU threads used for inference
            n_batch=512    # prompt-processing batch size
        )
        print("Model initialized successfully!")
        return model
    except Exception as e:
        print(f"Detailed error while loading the model: {str(e)}")
        raise
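# A minimal smoke-test sketch (the helper below is illustrative and never
# called by the app): it exercises the loaded model once with a plain,
# non-streaming completion call, using the same llama-cpp-python calling
# convention as respond() further down.
def smoke_test():
    llm = load_model()
    out = llm(prompt="User: Hi!\nAssistant: ", max_tokens=32, stop=["User:"])
    # Non-streaming calls return a single dict with the full completion text.
    print(out["choices"][0]["text"])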
def respond(message, history, system_message, max_new_tokens, temperature, top_p):
    try:
        global model
        if model is None:
            model = load_model()

        # Build a plain-text prompt: the system message first, then the chat
        # history as alternating "User:"/"Assistant:" turns.
        context = f"{system_message}\n\n"
        for user_msg, assistant_msg in history:
            context += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
        context += f"User: {message}\nAssistant: "

        print(f"Generating a response for a context of {len(context)} characters")
        response_text = ""

        # Use streaming generation
        for response in model(
            prompt=context,
            max_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            stop=["User:", "\n\n", "<|endoftext|>"],
            echo=False,
            stream=True
        ):
            chunk = response['choices'][0]['text']
            response_text += chunk
            print(f"Partial response: {chunk}")
            yield response_text  # emit the accumulated text so far

        print("Response generated in full.")
    except Exception as e:
        error_msg = f"An error occurred: {str(e)}"
        print(error_msg)
        yield error_msg
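# For illustration, with one prior exchange in history the prompt assembled
# above would look like this (assuming the default system message):
#
#   You are a friendly and helpful assistant. Answer thoughtfully and to the point.
#
#   User: Hi!
#   Assistant: Hello! How can I help?
#   User: <current message>
#   Assistant: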
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Message")

    with gr.Accordion("Parameters", open=False):
        system = gr.Textbox(
            value="You are a friendly and helpful assistant. Answer thoughtfully and to the point.",
            label="System message"
        )
        max_new_tokens = gr.Slider(
            minimum=1,
            maximum=2048,
            value=512,
            step=1,
            label="Max new tokens"
        )
        temperature = gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.3,
            step=0.1,
            label="Temperature"
        )
        top_p = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)"
        )

    clear = gr.Button("Clear")

    def user(user_message, history):
        # Append the new user turn with an empty assistant slot.
        return "", history + [[user_message, None]]

    def bot(history, system_message, max_new_tokens, temperature, top_p):
        message = history[-1][0]
        # Stream the growing response into the last history entry.
        for response_text in respond(message, history[:-1], system_message, max_new_tokens, temperature, top_p):
            history[-1][1] = response_text
            yield history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, [chatbot, system, max_new_tokens, temperature, top_p], chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)
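# For reference: with the tuple-style gr.Chatbot used here, history is a list
# of [user, assistant] pairs, e.g. [["Hi!", "Hello!"], ["And you?", None]]
# while the assistant's reply is still being streamed in.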
if __name__ == "__main__":
    try:
        print("Initializing the application...")
        model = load_model()
        print("Model loaded successfully at startup")
    except Exception as e:
        print(f"Error during initialization: {str(e)}")
    # Generator callbacks (bot streams its output) need the queue enabled
    # in Gradio 3.x; this is a no-op where queuing is already on by default.
    demo.queue()
    demo.launch()