Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,620 Bytes
49c9656 7dd32f0 49c9656 7dd32f0 49c9656 7dd32f0 49c9656 7dd32f0 49c9656 7dd32f0 49c9656 7dd32f0 49c9656 7dd32f0 49c9656 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
import gradio as gr
import spaces
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from transformers import AutoTokenizer
# Maximum number of prompt tokens chat() accepts; longer prompts are rejected
# with a message asking the user to clear the history.
MAX_INPUT_LIMIT = 3584
# NOTE(review): not referenced anywhere else in the visible source; chat()
# passes max_tokens=1024 instead -- confirm which limit is intended.
MAX_NEW_TOKENS = 1536
# Repository id the tokenizer is loaded from.
MODEL_HF = "Azure99/blossom-v5.1-34b"
# Repository id and file name of the model file downloaded below.
MODEL_REPO = "Azure99/blossom-v5.1-34b-gguf"
MODEL_FILE = "model-q6_k.gguf"
# Directory passed to hf_hub_download as the download target.
MODEL_LOCAL_DIR = "./"
# Runs at import time: fetch MODEL_FILE from MODEL_REPO into MODEL_LOCAL_DIR.
hf_hub_download(
repo_id=MODEL_REPO,
filename=MODEL_FILE,
local_dir=MODEL_LOCAL_DIR
)
# Shared model handle; starts as None and is created by chat() on first use.
llm: Llama = None
# Tokenizer used by get_input_ids() to turn prompt text into token ids.
tokenizer = AutoTokenizer.from_pretrained(MODEL_HF)
def get_input_ids(inst, history):
    """Encode the conversation plus the new message into a flat list of token ids.

    Args:
        inst: The new user message to be answered.
        history: Iterable of past turns; element ``[0]`` of each turn is the
            human message and element ``[1]`` is the bot reply.

    Returns:
        A list of token ids: the system preamble, every past turn, and the new
        message followed by the open ``|Bot|: `` cue.
    """
    system_prompt = ("A chat between a human and an artificial intelligence bot. "
                     "The bot gives helpful, detailed, and polite answers to the human's questions.")

    # Segments alternate human prompt / bot reply, so bot replies always land
    # on odd indices and the final (new) human prompt lands on an even one.
    segments = []
    for turn in history:
        segments.extend((f'\n|Human|: {turn[0]}\n|Bot|: ', f'{turn[1]}'))
    segments.append(f'\n|Human|: {inst}\n|Bot|: ')

    # The preamble is glued onto the very first human segment.
    segments[0] = system_prompt + segments[0]

    token_ids = []
    for index, segment in enumerate(segments):
        # Only the first segment is encoded with the tokenizer's special tokens.
        is_first = index == 0
        token_ids.extend(tokenizer.encode(segment, add_special_tokens=is_first))
        if index % 2:
            # Close every past bot reply with the end-of-sequence token.
            token_ids.append(tokenizer.eos_token_id)
    return token_ids
@spaces.GPU
def chat(inst, history, temperature, top_p, repetition_penalty):
    """Stream the bot's reply to ``inst`` as a progressively growing string.

    The conversation is tokenized with ``get_input_ids``. Prompts longer than
    ``MAX_INPUT_LIMIT`` tokens are rejected with a single message; otherwise the
    accumulated completion text is yielded after every streamed chunk.

    Args:
        inst: The new user message.
        history: Previous turns, forwarded unchanged to ``get_input_ids``.
        temperature: Sampling temperature passed to the model.
        top_p: Nucleus-sampling threshold passed to the model.
        repetition_penalty: Passed to the model as ``repeat_penalty``.

    Yields:
        The text generated so far, or one error message if the prompt is too long.
    """
    global llm
    import os  # local import: only needed to resolve the model path below

    # Validate the prompt first so an over-long request never triggers the
    # (expensive, one-time) model load.
    input_ids = get_input_ids(inst, history)
    if len(input_ids) > MAX_INPUT_LIMIT:
        yield "The input is too long, please clear the history."
        return

    if llm is None:
        # Load from the same directory the file was downloaded into, instead of
        # relying on MODEL_LOCAL_DIR happening to be the working directory.
        model_path = os.path.join(MODEL_LOCAL_DIR, MODEL_FILE)
        llm = Llama(model_path=model_path, n_gpu_layers=-1, flash_attn=True, offload_kqv=True, n_ctx=4096)

    # NOTE(review): max_tokens is hard-coded to 1024 while the module defines
    # MAX_NEW_TOKENS = 1536 -- confirm which limit is intended.
    generate_config = dict(temperature=temperature, top_p=top_p, repeat_penalty=repetition_penalty,
                           top_k=50, stream=True, max_tokens=1024)
    outputs = ""
    for chunk in llm(input_ids, **generate_config):
        # Each streamed chunk carries the next piece of text; yield the running total.
        outputs += chunk["choices"][0]["text"]
        yield outputs
# Sampling controls offered next to the chat box. The list order matches the
# extra parameters of chat(): temperature, top_p, repetition_penalty.
additional_inputs = [
    gr.Slider(interactive=True, **slider_kwargs)
    for slider_kwargs in (
        dict(
            label="Temperature",
            info="Controls randomness in choosing words.",
            minimum=0.0,
            maximum=1.0,
            step=0.05,
            value=0.5,
        ),
        dict(
            label="Top-P",
            info="Picks words until their combined probability is at least top_p.",
            minimum=0.0,
            maximum=1.0,
            step=0.05,
            value=0.85,
        ),
        dict(
            label="Repetition penalty",
            info="Repetition Penalty: Controls how much repetition is penalized.",
            minimum=1.0,
            maximum=1.2,
            step=0.01,
            value=1.05,
        ),
    )
]
# Build the individual widgets first (in the same order as before), then wire
# them into a ChatInterface around chat() and start serving it.
chat_window = gr.Chatbot(show_label=False, height=500, show_copy_button=True, render_markdown=True)
prompt_box = gr.Textbox(placeholder="", container=False, scale=7)
config_panel = gr.Accordion(label="Config", open=True)

demo = gr.ChatInterface(
    chat,
    chatbot=chat_window,
    textbox=prompt_box,
    title="Blossom 34B Demo",
    description='Hello, I am Blossom, an open source conversational large language model.🌠'
                '<a href="https://github.com/Azure99/BlossomLM">GitHub</a>',
    theme="soft",
    examples=[
        ["Hello"],
        ["What is MBTI"],
        ["用Python实现二分查找"],
        ["为switch写一篇小红书种草文案,带上emoji"],
    ],
    cache_examples=False,
    additional_inputs=additional_inputs,
    additional_inputs_accordion=config_panel,
    clear_btn="🗑️Clear",
    undo_btn="↩️Undo",
    retry_btn="🔄Retry",
    submit_btn="➡️Submit",
)
# Enable request queuing, then launch the app.
demo.queue().launch()
|