Azure99 commited on
Commit
49c9656
1 Parent(s): c797965

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -0
app.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import spaces
3
+ from huggingface_hub import hf_hub_download
4
+ from llama_cpp import Llama
5
+
6
# Maximum number of prompt tokens accepted; longer conversations must be cleared.
MAX_INPUT_LIMIT = 3584
# Generation budget in tokens.
# NOTE(review): chat() currently hard-codes max_tokens=1024 instead of using this — confirm intent.
MAX_NEW_TOKENS = 1536
# Hugging Face repo and quantized GGUF weights file for the Blossom v5.1 34B model.
MODEL_REPO = "Azure99/blossom-v5.1-34b-gguf"
MODEL_FILE = "model-q6_k.gguf"
# Download destination: the current working directory.
MODEL_LOCAL_DIR = "./"

# Fetch the model weights at import time so they exist before the first chat call.
hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
    local_dir=MODEL_LOCAL_DIR
)

# Lazily created llama.cpp handle; populated inside chat() on the first GPU-backed call.
llm: Llama = None
19
+
20
+
21
def get_input_ids(input, history):
    """Tokenize the conversation into a flat list of token ids.

    The prompt is built as a system preamble followed by alternating
    `|Human|:` / `|Bot|:` segments; each completed bot reply is terminated
    with the model's EOS token. `history` is a sequence of
    (user_message, bot_message) pairs and `input` is the new user message.
    """
    system_prompt = ("A chat between a human and an artificial intelligence bot. "
                     "The bot gives helpful, detailed, and polite answers to the human's questions.")

    segments = []
    for turn in history:
        segments.append(f'\n|Human|: {turn[0]}\n|Bot|: ')
        segments.append(f'{turn[1]}')
    segments.append(f'\n|Human|: {input}\n|Bot|: ')
    # Prepend the system preamble to the very first segment.
    segments[0] = system_prompt + segments[0]

    token_ids = []
    for idx, segment in enumerate(segments):
        token_ids.extend(llm.tokenize(segment.encode("utf-8"), add_bos=False, special=True))
        # Odd indices are finished bot replies — close each with EOS.
        if idx % 2 == 1:
            token_ids.append(llm.token_eos())
    return token_ids
36
+
37
+
38
@spaces.GPU
def chat(inst, history, temperature, top_p, repetition_penalty):
    """Stream a reply to `inst`, given the prior chat `history`.

    Yields the accumulated generated text after each chunk so Gradio can
    render the response incrementally. `temperature`, `top_p` and
    `repetition_penalty` come from the UI sliders.
    """
    global llm
    # Lazy initialization: load the model on the first GPU-backed call only.
    if llm is None:
        # Fix: n_ctx was a hard-coded 4096, which is smaller than the maximum
        # prompt (MAX_INPUT_LIMIT) plus the generation budget, so a maximal
        # conversation could overflow the context window. Size it from the
        # module constants instead.
        llm = Llama(model_path=MODEL_FILE, n_gpu_layers=-1, flash_attn=True, offload_kqv=True,
                    n_ctx=MAX_INPUT_LIMIT + MAX_NEW_TOKENS)

    input_ids = get_input_ids(inst, history)
    # Refuse over-long prompts rather than silently truncating the history.
    if len(input_ids) > MAX_INPUT_LIMIT:
        yield "The input is too long, please clear the history."
        return

    # Fix: max_tokens was a hard-coded 1024 that silently disagreed with the
    # MAX_NEW_TOKENS constant declared at the top of the file.
    generate_config = dict(temperature=temperature, top_p=top_p, repeat_penalty=repetition_penalty,
                           top_k=50, stream=True, max_tokens=MAX_NEW_TOKENS)

    outputs = ""
    for chunk in llm(input_ids, **generate_config):
        outputs += chunk["choices"][0]["text"]
        yield outputs
56
+
57
+
58
# Sampling-parameter sliders shown under the "Config" accordion.
# (label, value, minimum, maximum, step, info)
_slider_specs = [
    ("Temperature", 0.5, 0.0, 1.0, 0.05,
     "Controls randomness in choosing words."),
    ("Top-P", 0.85, 0.0, 1.0, 0.05,
     "Picks words until their combined probability is at least top_p."),
    ("Repetition penalty", 1.05, 1.0, 1.2, 0.01,
     "Repetition Penalty: Controls how much repetition is penalized."),
]
additional_inputs = [
    gr.Slider(
        label=label,
        value=value,
        minimum=minimum,
        maximum=maximum,
        step=step,
        interactive=True,
        info=info,
    )
    for label, value, minimum, maximum, step, info in _slider_specs
]
87
+
88
# Assemble the chat UI and launch the app.
chatbot_view = gr.Chatbot(show_label=False, height=500, show_copy_button=True, render_markdown=True)
input_box = gr.Textbox(placeholder="", container=False, scale=7)

demo = gr.ChatInterface(
    chat,
    chatbot=chatbot_view,
    textbox=input_box,
    title="Blossom 34B Demo",
    description='Hello, I am Blossom, an open source conversational large language model.🌠'
                '<a href="https://github.com/Azure99/BlossomLM">GitHub</a>',
    theme="soft",
    examples=[["Hello"], ["What is MBTI"], ["用Python实现二分查找"],
              ["为switch写一篇小红书种草文案,带上emoji"]],
    cache_examples=False,
    additional_inputs=additional_inputs,
    additional_inputs_accordion=gr.Accordion(label="Config", open=True),
    clear_btn="🗑️Clear",
    undo_btn="↩️Undo",
    retry_btn="🔄Retry",
    submit_btn="➡️Submit",
)
demo.queue().launch()