"""Gradio chat demo that serves the Blossom 34B GGUF model through llama-cpp."""

from typing import Optional

import gradio as gr
import spaces
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Prompts longer than this many tokens are rejected by chat().
MAX_INPUT_LIMIT = 3584
# NOTE(review): MAX_NEW_TOKENS is not referenced anywhere in this file; chat()
# caps generation with a literal max_tokens=1024. Confirm which value is
# intended before wiring this constant in.
MAX_NEW_TOKENS = 1536

MODEL_REPO = "Azure99/blossom-v5.1-34b-gguf"
MODEL_FILE = "model-q6_k.gguf"
MODEL_LOCAL_DIR = "./"

# Keep the path reported by the download so the loader always opens the file
# that was actually fetched, even if MODEL_LOCAL_DIR stops being the working
# directory.
MODEL_PATH = hf_hub_download(
    repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=MODEL_LOCAL_DIR
)

# Created on the first call to chat(); None until then.
llm: Optional[Llama] = None


def get_input_ids(input, history):
    """Tokenize the conversation into a flat list of prompt token ids.

    The prompt is a fixed system prefix followed by alternating
    ``|Human|`` / ``|Bot|`` turns. An EOS token is appended after every bot
    reply taken from ``history``; the final human turn ends with an open
    ``|Bot|: `` marker for the model to continue.

    Args:
        input: The new user message.
        history: Earlier turns; element 0 of each entry is read as the human
            message and element 1 as the bot reply.

    Returns:
        A list of token ids for the whole conversation.

    Requires the module-level ``llm`` to be loaded already.
    """
    prefix = (
        "A chat between a human and an artificial intelligence bot. "
        "The bot gives helpful, detailed, and polite answers to the human's questions."
    )

    # Even positions hold human prompts, odd positions hold bot replies.
    patterns = []
    for conv in history:
        patterns.append(f'\n|Human|: {conv[0]}\n|Bot|: ')
        patterns.append(f'{conv[1]}')
    patterns.append(f'\n|Human|: {input}\n|Bot|: ')
    patterns[0] = prefix + patterns[0]

    input_ids = []
    for i, pattern in enumerate(patterns):
        input_ids += llm.tokenize(pattern.encode("utf-8"), add_bos=False, special=True)
        if i % 2 == 1:
            # Close each historical bot reply with an EOS token.
            input_ids += [llm.token_eos()]
    return input_ids


@spaces.GPU
def chat(inst, history, temperature, top_p, repetition_penalty):
    """Stream a reply to ``inst`` given the chat ``history``.

    Loads the model on first use, builds the prompt with get_input_ids(), and
    yields the accumulated reply text after every streamed chunk. If the
    prompt exceeds MAX_INPUT_LIMIT tokens, a single notice is yielded instead
    and nothing is generated.

    Args:
        inst: The new user message.
        history: Earlier turns, forwarded to get_input_ids().
        temperature: Sampling temperature passed to the model.
        top_p: Top-p value passed to the model.
        repetition_penalty: Passed to the model as ``repeat_penalty``.

    Yields:
        The reply text generated so far.
    """
    global llm
    if llm is None:
        # NOTE(review): MAX_INPUT_LIMIT (3584) plus max_tokens (1024) is larger
        # than n_ctx (4096) — confirm how the backend handles long prompts.
        llm = Llama(
            model_path=MODEL_PATH,
            n_gpu_layers=-1,
            flash_attn=True,
            offload_kqv=True,
            n_ctx=4096,
        )

    input_ids = get_input_ids(inst, history)
    if len(input_ids) > MAX_INPUT_LIMIT:
        yield "The input is too long, please clear the history."
        return

    generate_config = dict(
        temperature=temperature,
        top_p=top_p,
        repeat_penalty=repetition_penalty,
        top_k=50,
        stream=True,
        max_tokens=1024,
    )

    outputs = ""
    for chunk in llm(input_ids, **generate_config):
        outputs += chunk["choices"][0]["text"]
        # Yield the cumulative text, not just the newest chunk.
        yield outputs


# Sampling controls shown in the "Config" accordion; their values are passed
# to chat() after the message and history.
additional_inputs = [
    gr.Slider(
        label="Temperature",
        value=0.5,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Controls randomness in choosing words.",
    ),
    gr.Slider(
        label="Top-P",
        value=0.85,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Picks words until their combined probability is at least top_p.",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.05,
        minimum=1.0,
        maximum=1.2,
        step=0.01,
        interactive=True,
        info="Repetition Penalty: Controls how much repetition is penalized.",
    ),
]

gr.ChatInterface(
    chat,
    chatbot=gr.Chatbot(
        show_label=False, height=500, show_copy_button=True, render_markdown=True
    ),
    textbox=gr.Textbox(placeholder="", container=False, scale=7),
    title="Blossom 34B Demo",
    description='Hello, I am Blossom, an open source conversational large language model.🌠'
                'GitHub',
    theme="soft",
    examples=[
        ["Hello"],
        ["What is MBTI"],
        ["用Python实现二分查找"],
        ["为switch写一篇小红书种草文案,带上emoji"],
    ],
    cache_examples=False,
    additional_inputs=additional_inputs,
    additional_inputs_accordion=gr.Accordion(label="Config", open=True),
    clear_btn="🗑️Clear",
    undo_btn="↩️Undo",
    retry_btn="🔄Retry",
    submit_btn="➡️Submit",
).queue().launch()