# Distilling the Knowledge from LLM through Self Chatting

"""
"""
import gradio

import config
from app_util import *

# Preset system prompts offered in the "System message" dropdown.
# The first entry is used as the default value and seeds the initial history.
system_list = [
    "You are a helpful assistant.",
    "你是一个导游。",  # "You are a tour guide."
    "你是一个英语老师。",  # "You are an English teacher."
    "你是一个程序员。",  # "You are a programmer."
    "你是一个心理咨询师。",  # "You are a psychological counselor."
]

# Markdown help text rendered at the bottom of the "User Simulator" tab.
# The leading backslash keeps the text from starting with an empty line.
user_simulator_doc = """\
There are mainly two types of user simulator:
- prompt-based user-simulator (role-play)
- model-based user-simulator

In most cases, large language models (LLMs) are used to serve as assistant generator.
Besides, it can also be used as user simulator.
"""

# Background notes (Markdown) on knowledge distillation.
# NOTE(review): `survey` is not referenced anywhere else in the visible part of this file.
# English gloss of the Chinese passages:
#   - "知识蒸馏": knowledge distillation
#   - "知识的形式可以是 QA纯文本，也可以是 QA+概率。":
#       the knowledge can take the form of plain QA text, or QA plus probabilities.
#   - "有不用概率的知识蒸馏吗？": is there knowledge distillation that does not use probabilities?
survey = """\
## knowledge distillation 知识蒸馏

Essentially, it is a form of model compression.

## distilling knowledge != knowledge distillation

知识的形式可以是 QA纯文本，也可以是 QA+概率。

## 有不用概率的知识蒸馏吗？
"""

# Build the Gradio UI.
#
# Layout: one row with two columns.
#   - First column: system-message dropdown, the shared chat window, and three
#     tabs ("Self Chat", "Response Generator", "User Simulator").
#   - Second column: model selector and sampling-parameter sliders.
#
# The event handlers (`reset_state`, `generate`, `undo_generate`, `set_*`) are
# not defined in this file; presumably they come from `app_util` via the star
# import — verify there before changing any input/output lists below.
with gr.Blocks() as demo:
    gr.HTML("""<h1 align="center">Distilling the Knowledge from LLM through Self Chatting</h1>""")
    with gr.Row():
        with gr.Column(scale=5):
            system = gr.Dropdown(
                choices=system_list,
                value=system_list[0],
                allow_custom_value=True,
                interactive=True,
                label="System message",
                scale=5,
            )

            # Single chat window shared by all three tabs below.
            chatbot = gr.Chatbot(
                show_copy_button=True,
                show_share_button=True,
                avatar_images=("assets/man.png", "assets/bot.png"),
            )

            with gr.Tab("Self Chat"):
                # Hidden textbox: it is only used as the first output slot of
                # the self-chat handlers wired up further down.
                generated_text_1 = gr.Textbox(show_label=False, placeholder="...", lines=10, visible=False)
                generate_btn = gr.Button("🤔️ Self-Chat", variant="primary")
                with gr.Row():
                    retry_btn = gr.Button("🔄  Retry", variant="secondary", size="sm")
                    undo_btn = gr.Button("↩️ Undo", variant="secondary", size="sm")
                    clear_btn = gr.Button("🗑️  Clear", variant="secondary", size="sm")  # 🧹 Clear History
                gr.Markdown(
                    "Self-chat is a demo, which makes the model talk to itself. "
                    "It is based on user simulator and response generator.",
                    visible=True)

            with gr.Tab("Response Generator"):
                with gr.Row():
                    generated_text_2 = gr.Textbox(show_label=False, placeholder="Please type your input", scale=7)
                    generate_btn_2 = gr.Button("Send", variant="primary")
                with gr.Row():
                    retry_btn_2 = gr.Button("🔄  Regenerate", variant="secondary", size="sm")
                    undo_btn_2 = gr.Button("↩️ Undo", variant="secondary", size="sm")
                    clear_btn_2 = gr.Button("🗑️  Clear", variant="secondary", size="sm")  # 🧹 Clear History
                gr.Markdown("Response simulator is the most commonly used chatbot.")

            with gr.Tab("User Simulator"):
                with gr.Row():
                    generated_text_3 = gr.Textbox(show_label=False, placeholder="Please type your response", scale=7)
                    generate_btn_3 = gr.Button("Send", variant="primary")
                with gr.Row():
                    retry_btn_3 = gr.Button("🔄  Regenerate", variant="secondary", size="sm")
                    undo_btn_3 = gr.Button("↩️ Undo", variant="secondary", size="sm")
                    clear_btn_3 = gr.Button("🗑️  Clear", variant="secondary", size="sm")  # 🧹 Clear History
                gr.Markdown(user_simulator_doc)

        with gr.Column(variant="compact"):
            model = gr.Dropdown(
                ["Qwen2-0.5B-Instruct", "llama3.1", "gemini"],
                value="Qwen2-0.5B-Instruct",
                label="Model",
                interactive=True,
            )
            with gr.Accordion(label="Parameters", open=True):
                slider_max_tokens = gr.Slider(minimum=1, maximum=config.MAX_SEQUENCE_LENGTH,
                                              value=config.DEFAULT_MAX_TOKENS, step=1, label="Max tokens")
                slider_temperature = gr.Slider(minimum=0.1, maximum=10.0,
                                               value=config.DEFAULT_TEMPERATURE, step=0.1, label="Temperature",
                                               info="Larger temperature increase the randomness")
                slider_top_p = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=config.DEFAULT_TOP_P,
                    step=0.05,
                    label="Top-p (nucleus sampling)",
                )
                slider_top_k = gr.Slider(
                    minimum=1,
                    maximum=200,
                    value=config.DEFAULT_TOP_K,
                    step=1,
                    label="Top-k",
                )

    # ---- Event wiring -------------------------------------------------------
    # Conversation state, seeded with the default system message. Only the
    # system message is unique to it; the rest overlaps with the chatbot content.
    history = gr.State([{"role": "system", "content": system_list[0]}])

    # Picking another system message resets the conversation.
    system.change(reset_state, inputs=[system], outputs=[chatbot, history])

    # Every tab has its own "Clear" button, but they all act on the one shared
    # chat window, so all of them are bound to the same reset handler.
    for clear_button in (clear_btn, clear_btn_2, clear_btn_3):
        clear_button.click(reset_state, inputs=[system], outputs=[chatbot, history])

    # Self-chat tab: generate, retry (= undo followed by generate), undo.
    generate_btn.click(generate, [chatbot, history], outputs=[generated_text_1, chatbot, history],
                       show_progress="full")
    retry_btn.click(undo_generate, [chatbot, history], outputs=[generated_text_1, chatbot, history]) \
        .then(generate, [chatbot, history], outputs=[generated_text_1, chatbot, history],
              show_progress="full")
    undo_btn.click(undo_generate, [chatbot, history], outputs=[generated_text_1, chatbot, history])

    # NOTE(review): the Send / Regenerate / Undo buttons of the
    # "Response Generator" and "User Simulator" tabs, and the `model` dropdown,
    # have no handlers attached in this file.

    # Sampling parameters are pushed to the backend as soon as a slider moves.
    slider_max_tokens.change(set_max_tokens, inputs=[slider_max_tokens])
    slider_temperature.change(set_temperature, inputs=[slider_temperature])
    slider_top_p.change(set_top_p, inputs=[slider_top_p])
    slider_top_k.change(set_top_k, inputs=[slider_top_k])

# Alternative launch configurations, kept for reference:
# demo.queue().launch(share=False, server_name="0.0.0.0")
# demo.queue().launch(concurrency_count=1, max_size=5)
# NOTE(review): `concurrency_count` / `max_size` look like queue options rather
# than launch options — check the installed Gradio version before re-enabling.

# Enable the request queue, then start the app with the default launch settings.
demo.queue().launch()