"""Gradio chat UI that streams completions from a LoRA-finetuned causal LM."""

import spaces
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
import gradio as gr

from utils.chatbot_local import ChatBot

MODEL_PATH = 'lora_adapter'

# Load the adapter-merged model in fp16 and let HF accelerate place it on
# whatever devices are available; the tokenizer comes from the same path.
model = AutoPeftModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
chatbot = ChatBot(model=model, tokenizer=tokenizer)

# Text shown in the empty chat window before the first message.
PLACEHOLDER = """

Hi! How can I help you today?

"""

CSS = """ .duplicate-button { margin: auto !important; color: white !important; background: black !important; border-radius: 100vh !important; } h3 { text-align: center; } """


@spaces.GPU()
def stream_chat(
    message: str,
    history: list,
):
    """Yield the assistant's reply incrementally as it is generated.

    Args:
        message: The user's latest input.
        history: Prior turns as (user, assistant) pairs — the classic
            Gradio tuples chat format.

    Yields:
        The partial response so far, growing one token at a time so the
        UI can render a streaming effect.
    """
    print(f'message: {message}')
    print(f'history: {history}')

    # Rebuild the full conversation: system prompt, then each past
    # exchange, then the new user message.
    conversation = [{"role": "system", "content": "You are a helpful assistant."}]
    for user_turn, bot_turn in history:
        conversation.append({"role": "user", "content": user_turn})
        conversation.append({"role": "assistant", "content": bot_turn})
    conversation.append({"role": "user", "content": message})

    # Accumulate tokens and re-yield the whole prefix each time; Gradio
    # replaces the message body with the latest yielded value.
    partial = ""
    for token in chatbot.chat(messages=conversation, stream=True):
        partial += token
        yield partial


gr_chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)

with gr.Blocks(css=CSS, theme="soft") as demo:
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=gr_chatbot,
        fill_height=True,
    )

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        share=False,
        debug=True,
    )