File size: 8,157 Bytes

b931852
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dac6dcf
b931852
dac6dcf
b931852
 
dac6dcf
b931852
dac6dcf
b931852
dac6dcf
b931852
 
dac6dcf
 
 
8e0b125
dac6dcf
8e0b125
 
dac6dcf
8e0b125
dac6dcf
8e0b125
dac6dcf
8e0b125
 
dac6dcf
 
 
8e0b125
b931852

# -*- coding:utf-8 -*-
import os
import logging
import gradio as gr
import gc
from interface.hddr_llama_onnx_interface import LlamaOnnxInterface
from interface.empty_stub_interface import EmptyStubInterface
from ChatApp.app_modules.utils import (
    reset_textbox,
    transfer_input,
    reset_state,
    delete_last_conversation,
    cancel_outputing,
)
from ChatApp.app_modules.presets import (
    small_and_beautiful_theme,
    title,
    description_top,
    description,
)
from ChatApp.app_modules.overwrites import postprocess

logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s",
)

# we can filter this dictionary at the start according to the actual available files on disk
empty_stub_model_name = "_Empty Stub_"

top_directory = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))

tokenizer_path = os.path.join(top_directory, "tokenizer.model")

available_models = {
    "Llama-2 7B Chat Float16": {
        "onnx_file": os.path.join(
            top_directory, "FP16-Chat", "LlamaV2_7B_FT_float16.onnx"
        ),
        "tokenizer_path": tokenizer_path,
        "embedding_file": os.path.join(top_directory, "embeddings-chat.pth"),
    },
    "Llama-2 7B Chat Float32": {
        "onnx_file": os.path.join(
            top_directory, "FP32-Chat", "LlamaV2_7B_FT_float32.onnx"
        ),
        "tokenizer_path": tokenizer_path,
        "embedding_file": os.path.join(top_directory, "embeddings-chat.pth"),
    },    
    "Llama-2 7B Float16": {
        "onnx_file": os.path.join(
            top_directory, "FP16", "LlamaV2_7B_float16.onnx"
        ),
        "tokenizer_path": tokenizer_path,
        "embedding_file": os.path.join(top_directory, "embeddings.pth"),
    },
    "Llama-2 7B Float32": {
        "onnx_file": os.path.join(
            top_directory, "FP32", "LlamaV2_7B_float32.onnx"
        ),
        "tokenizer_path": tokenizer_path,
        "embedding_file": os.path.join(
            top_directory, "embeddings.pth"
        ),
    },
}


interface = EmptyStubInterface()
interface.initialize()

# interface = None

gr.Chatbot.postprocess = postprocess

with open("ChatApp/assets/custom.css", "r", encoding="utf-8") as f:
    custom_css = f.read()


def change_model_listener(new_model_name):
    if new_model_name is None:
        new_model_name = empty_stub_model_name

    global interface

    # if a model exists - shut it down before trying to create the new one
    if interface is not None:
        interface.shutdown()
        del interface
        gc.collect()

    logging.info(f"Creating a new model [{new_model_name}]")
    if new_model_name == empty_stub_model_name:
        interface = EmptyStubInterface()
        interface.initialize()
    else:
        d = available_models[new_model_name]
        interface = LlamaOnnxInterface(
            onnx_file=d["onnx_file"],
            tokenizer_path=d["tokenizer_path"],
            embedding_file=d["embedding_file"],
        )
        interface.initialize()

    return new_model_name


def interface_predict(*args):
    global interface
    res = interface.predict(*args)

    for x in res:
        yield x


def interface_retry(*args):
    global interface
    res = interface.retry(*args)

    for x in res:
        yield x


with gr.Blocks(css=custom_css, theme=small_and_beautiful_theme) as demo:
    history = gr.State([])
    user_question = gr.State("")
    with gr.Row():
        gr.HTML(title)
        status_display = gr.Markdown("Success", elem_id="status_display")
    gr.Markdown(description_top)

    with gr.Row():
        with gr.Column(scale=5):
            with gr.Row():
                chatbot = gr.Chatbot(elem_id="chuanhu_chatbot", height=900)
            with gr.Row():
                with gr.Column(scale=12):
                    user_input = gr.Textbox(show_label=False, placeholder="Enter text")
                with gr.Column(min_width=70, scale=1):
                    submit_button = gr.Button("Send")
                with gr.Column(min_width=70, scale=1):
                    cancel_button = gr.Button("Stop")
            with gr.Row():
                empty_button = gr.Button(
                    "🧹 New Conversation",
                )
                retry_button = gr.Button("🔄 Regenerate")
                delete_last_button = gr.Button("🗑️ Remove Last Turn")
        with gr.Column():
            with gr.Column(min_width=50, scale=1):
                with gr.Tab(label="Parameter Setting"):
                    gr.Markdown("# Model")
                    model_name = gr.Dropdown(
                        choices=[empty_stub_model_name] + list(available_models.keys()),
                        label="Model",
                        show_label=False,  # default="Empty STUB",
                    )
                    model_name.change(
                        change_model_listener, inputs=[model_name], outputs=[model_name]
                    )

                    gr.Markdown("# Parameters")
                    top_p = gr.Slider(
                        minimum=-0,
                        maximum=1.0,
                        value=0.9,
                        step=0.05,
                        interactive=True,
                        label="Top-p",
                    )
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=2.0,
                        value=0.75,
                        step=0.1,
                        interactive=True,
                        label="Temperature",
                    )
                    max_length_tokens = gr.Slider(
                        minimum=0,
                        maximum=512,
                        value=256,
                        step=8,
                        interactive=True,
                        label="Max Generation Tokens",
                    )
                    max_context_length_tokens = gr.Slider(
                        minimum=0,
                        maximum=4096,
                        value=2048,
                        step=128,
                        interactive=True,
                        label="Max History Tokens",
                    )
    gr.Markdown(description)

    predict_args = dict(
        # fn=interface.predict,
        fn=interface_predict,
        inputs=[
            user_question,
            chatbot,
            history,
            top_p,
            temperature,
            max_length_tokens,
            max_context_length_tokens,
        ],
        outputs=[chatbot, history, status_display],
        show_progress=True,
    )
    retry_args = dict(
        fn=interface_retry,
        inputs=[
            user_input,
            chatbot,
            history,
            top_p,
            temperature,
            max_length_tokens,
            max_context_length_tokens,
        ],
        outputs=[chatbot, history, status_display],
        show_progress=True,
    )

    reset_args = dict(fn=reset_textbox, inputs=[], outputs=[user_input, status_display])

    # Chatbot
    transfer_input_args = dict(
        fn=transfer_input,
        inputs=[user_input],
        outputs=[user_question, user_input, submit_button],
        show_progress=True,
    )

    predict_event1 = user_input.submit(**transfer_input_args).then(**predict_args)

    predict_event2 = submit_button.click(**transfer_input_args).then(**predict_args)

    empty_button.click(
        reset_state,
        outputs=[chatbot, history, status_display],
        show_progress=True,
    )
    empty_button.click(**reset_args)

    predict_event3 = retry_button.click(**retry_args)

    delete_last_button.click(
        delete_last_conversation,
        [chatbot, history],
        [chatbot, history, status_display],
        show_progress=True,
    )
    cancel_button.click(
        cancel_outputing,
        [],
        [status_display],
        cancels=[predict_event1, predict_event2, predict_event3],
    )

    demo.load(change_model_listener, inputs=None, outputs=model_name)

demo.title = "Llama-2 Chat UI"

demo.queue(concurrency_count=1).launch()