"""
## convert to gguf
python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/
## predict
./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "I believe the meaning of life is" -n 128
./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -f prompt.txt -n 128
./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "You are a helpful assistant" -cnv
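The same gguf can also be driven from the llama-cpp-python bindings used in this file; a minimal sketch (the model path is assumed to match the converted file above):

    from llama_cpp import Llama

    llm = Llama(
        model_path="/workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf",
        n_ctx=2048,
    )
    out = llm.create_completion("I believe the meaning of life is", max_tokens=128)
    print(out["choices"][0]["text"])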
## timing
**Chongqing GPU server, empty cache**
llama_print_timings: load time = 1711.48 ms
llama_print_timings: sample time = 73.89 ms / 41 runs ( 1.80 ms per token, 554.84 tokens per second)
llama_print_timings: prompt eval time = 2621.25 ms / 5 tokens ( 524.25 ms per token, 1.91 tokens per second) # 0.2-0.5 s per token
llama_print_timings: eval time = 1430.91 ms / 40 runs ( 35.77 ms per token, 27.95 tokens per second)
llama_print_timings: total time = 4848.09 ms / 45 tokens
llama_print_timings: load time = 1939.72 ms
llama_print_timings: sample time = 286.69 ms / 170 runs ( 1.69 ms per token, 592.99 tokens per second)
llama_print_timings: prompt eval time = 0.00 ms / 0 tokens ( -nan ms per token, -nan tokens per second) # after warmup, noticeably faster
llama_print_timings: eval time = 5737.50 ms / 170 runs ( 33.75 ms per token, 29.63 tokens per second)
llama_print_timings: total time = 8219.82 ms / 170 tokens
**hf-space, empty cache (GGML_BLAS disabled)** -----------
llama_print_timings: load time = 28230.06 ms
llama_print_timings: sample time = 147.58 ms / 8 runs ( 18.45 ms per token, 54.21 tokens per second) # 18ms/token
llama_print_timings: prompt eval time = 28864.82 ms / 5 tokens ( 5772.96 ms per token, 0.17 tokens per second) # 5.7s/token
llama_print_timings: eval time = 1557.94 ms / 7 runs ( 222.56 ms per token, 4.49 tokens per second)
llama_print_timings: total time = 30753.48 ms / 12 tokens
**hf-space, empty cache (GGML_BLAS enabled)** -----------
llama_print_timings: load time = 27347.29 ms
llama_print_timings: sample time = 82.53 ms / 26 runs ( 3.17 ms per token, 315.05 tokens per second) # 3ms/token
llama_print_timings: prompt eval time = 28855.64 ms / 9 tokens ( 3206.18 ms per token, 0.31 tokens per second) # 3s/token
llama_print_timings: eval time = 9810.01 ms / 25 runs ( 392.40 ms per token, 2.55 tokens per second)
llama_print_timings: total time = 39073.77 ms / 34 tokens
llama_print_timings: load time = 27347.29 ms
llama_print_timings: sample time = 272.12 ms / 96 runs ( 2.83 ms per token, 352.79 tokens per second) # 2.8ms/token
llama_print_timings: prompt eval time = 0.00 ms / 0 tokens ( -nan ms per token, -nan tokens per second)
llama_print_timings: eval time = 19974.85 ms / 96 runs ( 208.07 ms per token, 4.81 tokens per second)
llama_print_timings: total time = 22517.08 ms / 96 tokens
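Note: GGML_BLAS above refers to the llama.cpp CMake option; a typical BLAS-enabled build (assuming OpenBLAS is installed) looks like:

    cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
    cmake --build build --config Release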
## TODO:
- fix the slow warmup
- support the cache, and pre-cache all preset system prompts in advance
## reference
- https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/local.py
- https://github.com/awinml/llama-cpp-python-bindings
- https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/llms/llamacpp.py
- https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/server.py
- https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/model.py
- https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/app.py
"""
import json
import copy
import os
import psutil
import llama_cpp
from transformers import AutoTokenizer
from models.base_model import Simulator
from utils.logging_util import logger
import config
class Qwen2Simulator(Simulator):
def __init__(self, system_list=None):
local_path = "/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct-GGUF/qwen2-0_5b-instruct-fp16.gguf"
if os.path.exists(local_path):
self.hf_tokenizer = AutoTokenizer.from_pretrained(
"/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/")
self.llm = llama_cpp.Llama( # n_ctx, n_threads
model_path=local_path,
                # the default gguf tokenizer has a bug: it produces different token ids, so use the HF tokenizer
                tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
                n_ctx=config.MAX_SEQUENCE_LENGTH,
                # n_threads=None,  # defaults to the number of CPU cores
# use_mlock=True,
verbose=True,
)
else:
self.hf_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
self.llm = llama_cpp.Llama.from_pretrained(
repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
filename="*fp16.gguf",
n_ctx=config.MAX_SEQUENCE_LENGTH,
# use_mlock=True,
verbose=True,
)
logger.info(f"llm has been initialized: {self.llm}, "
f"n_threads={self.llm.n_threads}, n_ctx={self.llm.n_ctx}, "
f"env[CACHE]={os.environ.get('CACHE', None)}")
        # qwen2-0.5b-chat sometimes ends generation without <|im_end|>, going straight to <|im_start|>
self.assistant_stop_words = [
"<|im_end|>",
"<|im_start|>",
"<|endoftext|>",
]
self.assistant_stop_tokens = self.tokenize("".join(self.assistant_stop_words))
self.user_stop_words = self.assistant_stop_words + ["?", "?"]
self.user_stop_tokens = self.tokenize("".join(self.user_stop_words))
logger.info(f"assistant_stop_tokens: {self.assistant_stop_tokens}")
logger.info(f"user_stop_tokens: {self.user_stop_tokens}")
self.generation_kwargs = dict(
temperature=config.DEFAULT_TEMPERATURE,
top_p=config.DEFAULT_TOP_P,
top_k=config.DEFAULT_TOP_K,
max_tokens=config.DEFAULT_MAX_NEW_TOKENS,
repeat_penalty=1.1,
)
self.user_start_tokens = self.tokenize("<|im_start|>user\n")
self.assistant_start_tokens = self.tokenize("<|im_start|>assistant\n")
# self.llm.generate .set_cache .last_n_tokens_size .reset .ctx ._ctx
# cache = llama_cpp.LlamaDiskCache(capacity_bytes=cache_size)
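        # LlamaRAMCache stores llama state keyed by token prefix, so a prompt that shares
        # a cached prefix (e.g. a pre-cached system prompt) can skip re-evaluating it.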
cache = llama_cpp.LlamaRAMCache(capacity_bytes=2 << 30) # 2G
self.llm.set_cache(cache)
if system_list is not None:
self.pre_cache_system(system_list)
def tokenize(self, text):
return self.llm.tokenize(text.encode("utf-8"))
def detokenize(self, tokens):
return self.llm.detokenize(tokens).decode("utf-8")
def strip_stoptokens(self, tokens):
while tokens and tokens[0] in self.assistant_stop_tokens:
logger.info(f"head-striping {tokens[0]} {self.detokenize([tokens[0]])}")
tokens.pop(0)
while tokens and tokens[-1] in self.assistant_stop_tokens:
logger.info(f"tail-striping {tokens[-1]} {self.detokenize([tokens[-1]])}")
tokens.pop()
return tokens
def generate(self, history, stream=True):
"""
        extra forward pass: 5 tokens remain to be forwarded: "<|im_end|>\n<|im_start|>assistant\n"
:param history:
:param stream:
:return:
"""
if history[-1]['role'] in ["user"]:
start_tokens = self.assistant_start_tokens
stop_words = self.assistant_stop_words
suffix_tokens = self.user_start_tokens
elif history[-1]['role'] in ["assistant", "system"]:
start_tokens = self.user_start_tokens
stop_words = self.user_stop_words
suffix_tokens = self.assistant_start_tokens
input_ids = []
for message in history:
if "tokens" not in message: # tokens
message["tokens"] = self.tokenize(message["content"])
input_ids += self.tokenize(f"<|im_start|>{message['role']}\n") \
+ message["tokens"] \
+ self.tokenize("<|im_end|>\n")
input_ids += start_tokens
if stream:
return self._stream_generate(input_ids, stop_words, suffix_tokens)
else:
return self._generate(input_ids)
def _stream_generate(self, input_ids, stop_words, suffix_tokens=None):
logger.info(f"generation_kwargs {self.generation_kwargs}")
output = self.llm.create_completion(
input_ids,
stream=True,
stop=stop_words,
**self.generation_kwargs
)
        # TODO: check the finish_reason; if it is "length", shift the context and continue generating.
        # TODO: also return the token ids.
for out in output:
stream = copy.deepcopy(out)
if stream["choices"][0]["finish_reason"] is None:
yield stream["choices"][0]["completion_text"], stream["choices"][0]["completion_tokens"]
else:
logger.info(
f'finish_reason {stream["choices"][0]["finish_reason"]} with text: {stream["choices"][0]["text"]}')
self.post_cache(suffix_tokens)
def pre_cache_system(self, system_list):
""" warmup for system prompt
:param system_list:
:return:
"""
logger.info(f"cache size {self.llm.cache.cache_size}")
for system_prompt in system_list:
logger.info(f"pre caching '{system_prompt}'")
input_ids = self.tokenize(f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n")
_output = self.llm.create_completion(
input_ids,
stream=False,
max_tokens=1,
top_k=1
)
logger.info(
f"cache size {self.llm.cache.cache_size}={self.llm.cache.cache_size / 1024 / 1024 / 1024:.2f} GB, "
f"process_mem: {psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024:.2f} GB")
self._disable_cache()
def post_cache(self, suffix_tokens):
""" warmup for next turn generation
:param suffix_tokens:
:return:
"""
logger.info(f"cache size {self.llm.cache.cache_size}={self.llm.cache.cache_size / 1024 / 1024 / 1024:.2f} GB, "
f"process_mem: {psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024:.2f} GB")
if suffix_tokens:
logger.info(f"before warmup: n_tokens = {self.llm.n_tokens}")
self.llm.eval([151645, 198] + suffix_tokens) # <|im_end|>\n
logger.info(f"after warmup: n_tokens = {self.llm.n_tokens}")
logger.info(f"cache size {self.llm.cache.cache_size}={self.llm.cache.cache_size / 1024 / 1024 / 1024:.2f} GB, "
f"process_mem: {psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024:.2f} GB")
def _disable_cache(self):
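        # No-op cache writes and state saving so later generations do not add new entries;
        # after pre_cache_system() the cache keeps only the pre-computed system-prompt states.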
llama_cpp.LlamaRAMCache.__setitem__ = lambda *args: None
llama_cpp.Llama.save_state = lambda *args: None
if __name__ == "__main__":
bot = Qwen2Simulator()
messages = [{"role": "system", "content": "你是一个导游。"}]
generated_tokens = None
print("######## requesting", messages)
for generated_text, generated_tokens in bot.generate(messages, stream=True):
print(generated_text, generated_tokens)
for i in range(3):
generated_tokens = bot.strip_stoptokens(generated_tokens)
messages.append(
{"role": "user" if i % 2 == 0 else "assistant", "content": generated_text, "tokens": generated_tokens})
print("######## requesting", messages)
for generated_text, generated_tokens in bot.generate(messages, stream=True):
pass
# print(generated_text, all_tokens)