"""
## convert to gguf

python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/
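
# To control the output location, convert_hf_to_gguf.py also accepts --outfile / --outtype
# (a sketch; the output path below is illustrative and flag names assume a recent llama.cpp checkout)
python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/ --outtype f16 --outfile /workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/qwen2-0_5b-instruct-f16.gguf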

## predict
./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "I believe the meaning of life is" -n 128
./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -f prompt.txt -n 128
./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "You are a helpful assistant" -cnv
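
## prompt format
# The prompt that Qwen2Simulator.generate (below) assembles follows Qwen2's ChatML layout,
# shown schematically here; the trailing "<|im_start|>assistant\n" is what cues the next
# assistant turn:
<|im_start|>system
{system prompt}<|im_end|>
<|im_start|>user
{user message}<|im_end|>
<|im_start|>assistant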


## timing


**Chongqing GPU server, empty cache**
llama_print_timings:        load time =    1711.48 ms
llama_print_timings:      sample time =      73.89 ms /    41 runs   (    1.80 ms per token,   554.84 tokens per second)
llama_print_timings: prompt eval time =    2621.25 ms /     5 tokens (  524.25 ms per token,     1.91 tokens per second)   # 0.2-0.5 s/token
llama_print_timings:        eval time =    1430.91 ms /    40 runs   (   35.77 ms per token,    27.95 tokens per second)
llama_print_timings:       total time =    4848.09 ms /    45 tokens

llama_print_timings:        load time =    1939.72 ms
llama_print_timings:      sample time =     286.69 ms /   170 runs   (    1.69 ms per token,   592.99 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     0 tokens (    -nan ms per token,     -nan tokens per second)  # after warmup, clearly faster
llama_print_timings:        eval time =    5737.50 ms /   170 runs   (   33.75 ms per token,    29.63 tokens per second)
llama_print_timings:       total time =    8219.82 ms /   170 tokens


**hf-space, empty cache (GGML_BLAS disabled)** -----------
llama_print_timings:        load time =   28230.06 ms
llama_print_timings:      sample time =     147.58 ms /     8 runs   (   18.45 ms per token,    54.21 tokens per second)   # 18ms/token
llama_print_timings: prompt eval time =   28864.82 ms /     5 tokens ( 5772.96 ms per token,     0.17 tokens per second)   # 5.7s/token
llama_print_timings:        eval time =    1557.94 ms /     7 runs   (  222.56 ms per token,     4.49 tokens per second)
llama_print_timings:       total time =   30753.48 ms /    12 tokens


**hf-space, empty cache (GGML_BLAS enabled)** -----------
llama_print_timings:        load time =   27347.29 ms
llama_print_timings:      sample time =      82.53 ms /    26 runs   (    3.17 ms per token,   315.05 tokens per second)   # 3ms/token
llama_print_timings: prompt eval time =   28855.64 ms /     9 tokens ( 3206.18 ms per token,     0.31 tokens per second)   # 3s/token
llama_print_timings:        eval time =    9810.01 ms /    25 runs   (  392.40 ms per token,     2.55 tokens per second)
llama_print_timings:       total time =   39073.77 ms /    34 tokens

llama_print_timings:        load time =   27347.29 ms
llama_print_timings:      sample time =     272.12 ms /    96 runs   (    2.83 ms per token,   352.79 tokens per second)   # 2.8ms/token
llama_print_timings: prompt eval time =       0.00 ms /     0 tokens (    -nan ms per token,     -nan tokens per second)
llama_print_timings:        eval time =   19974.85 ms /    96 runs   (  208.07 ms per token,     4.81 tokens per second)
llama_print_timings:       total time =   22517.08 ms /    96 tokens


## TODO:

- Fix the slow warmup.
- Support caching, and pre-cache all preset system prompts ahead of time.

## reference

- https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/local.py
- https://github.com/awinml/llama-cpp-python-bindings
- https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/llms/llamacpp.py
- https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/server.py
- https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/model.py
- https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/app.py
"""

import json
import copy
import os
import psutil
import llama_cpp
from transformers import AutoTokenizer

from models.base_model import Simulator
from utils.logging_util import logger
import config


class Qwen2Simulator(Simulator):

    def __init__(self, system_list=None):
        local_path = "/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct-GGUF/qwen2-0_5b-instruct-fp16.gguf"
        if os.path.exists(local_path):
            self.hf_tokenizer = AutoTokenizer.from_pretrained(
                "/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/")
            self.llm = llama_cpp.Llama(  # n_ctx, n_threads
                model_path=local_path,
                # the default gguf tokenizer has a bug: its token ids differ from the
                # HF tokenizer's, so the HF tokenizer is used instead (see the sketch below)
                tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
                n_ctx=config.MAX_SEQUENCE_LENGTH,
                # n_threads=None,  # defaults to the number of CPU cores
                # use_mlock=True,
                verbose=True,
            )
        else:
            self.hf_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
            self.llm = llama_cpp.Llama.from_pretrained(
                repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
                tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
                filename="*fp16.gguf",
                n_ctx=config.MAX_SEQUENCE_LENGTH,
                # use_mlock=True,
                verbose=True,
            )
        logger.info(f"llm has been initialized: {self.llm}, "
                    f"n_threads={self.llm.n_threads}, n_ctx={self.llm.n_ctx}, "
                    f"env[CACHE]={os.environ.get('CACHE', None)}")


        # qwen2-0.5b-chat sometimes ends generation without <|im_end|>, jumping straight to <|im_start|>
        self.assistant_stop_words = [
            "<|im_end|>",
            "<|im_start|>",
            "<|endoftext|>",
        ]
        self.assistant_stop_tokens = self.tokenize("".join(self.assistant_stop_words))
        self.user_stop_words = self.assistant_stop_words + ["?", "？"]  # ASCII and fullwidth question marks
        self.user_stop_tokens = self.tokenize("".join(self.user_stop_words))
        logger.info(f"assistant_stop_tokens: {self.assistant_stop_tokens}")
        logger.info(f"user_stop_tokens: {self.user_stop_tokens}")

        self.generation_kwargs = dict(
            temperature=config.DEFAULT_TEMPERATURE,
            top_p=config.DEFAULT_TOP_P,
            top_k=config.DEFAULT_TOP_K,
            max_tokens=config.DEFAULT_MAX_NEW_TOKENS,
            repeat_penalty=1.1,
        )
        self.user_start_tokens = self.tokenize("<|im_start|>user\n")
        self.assistant_start_tokens = self.tokenize("<|im_start|>assistant\n")
        # self.llm.generate  .set_cache   .last_n_tokens_size  .reset  .ctx ._ctx

        # cache = llama_cpp.LlamaDiskCache(capacity_bytes=cache_size)
        cache = llama_cpp.LlamaRAMCache(capacity_bytes=2 << 30)  # 2G
        self.llm.set_cache(cache)

        if system_list is not None:
            self.pre_cache_system(system_list)

    def tokenize(self, text):
        return self.llm.tokenize(text.encode("utf-8"))

    def detokenize(self, tokens):
        return self.llm.detokenize(tokens).decode("utf-8")

    def strip_stoptokens(self, tokens):
        while tokens and tokens[0] in self.assistant_stop_tokens:
            logger.info(f"head-striping {tokens[0]} {self.detokenize([tokens[0]])}")
            tokens.pop(0)
        while tokens and tokens[-1] in self.assistant_stop_tokens:
            logger.info(f"tail-striping {tokens[-1]} {self.detokenize([tokens[-1]])}")
            tokens.pop()
        return tokens

    def generate(self, history, stream=True):
        """
        Extra forward pass: 5 tokens remain to forward, "<|im_end|>\n<|im_start|>assistant\n"

        :param history:
        :param stream:
        :return:
        """
        if history[-1]['role'] in ["user"]:
            start_tokens = self.assistant_start_tokens
            stop_words = self.assistant_stop_words
            suffix_tokens = self.user_start_tokens

        elif history[-1]['role'] in ["assistant", "system"]:
            start_tokens = self.user_start_tokens
            stop_words = self.user_stop_words
            suffix_tokens = self.assistant_start_tokens

        input_ids = []
        for message in history:
            if "tokens" not in message:  # tokens
                message["tokens"] = self.tokenize(message["content"])
            input_ids += self.tokenize(f"<|im_start|>{message['role']}\n") \
                         + message["tokens"] \
                         + self.tokenize("<|im_end|>\n")
        input_ids += start_tokens
        if stream:
            return self._stream_generate(input_ids, stop_words, suffix_tokens)
        else:
            return self._generate(input_ids)

    def _stream_generate(self, input_ids, stop_words, suffix_tokens=None):
        logger.info(f"generation_kwargs {self.generation_kwargs}")
        output = self.llm.create_completion(
            input_ids,
            stream=True,
            stop=stop_words,
            **self.generation_kwargs
        )
        # TODO: check finish_reason; if it is "length", shift the context and continue generating.
        # TODO: return token_id as well
        for out in output:
            stream = copy.deepcopy(out)
            if stream["choices"][0]["finish_reason"] is None:
                yield stream["choices"][0]["completion_text"], stream["choices"][0]["completion_tokens"]
            else:
                logger.info(
                    f'finish_reason {stream["choices"][0]["finish_reason"]} with text: {stream["choices"][0]["text"]}')

        #
        self.post_cache(suffix_tokens)

    def pre_cache_system(self, system_list):
        """ warmup for system prompt
        :param system_list:
        :return:
        """
        logger.info(f"cache size {self.llm.cache.cache_size}")
        for system_prompt in system_list:
            logger.info(f"pre caching '{system_prompt}'")
            input_ids = self.tokenize(f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n")
            _output = self.llm.create_completion(
                input_ids,
                stream=False,
                max_tokens=1,
                top_k=1
            )
            logger.info(
                f"cache size {self.llm.cache.cache_size}={self.llm.cache.cache_size / 1024 / 1024 / 1024:.2f} GB, "
                f"process_mem: {psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024:.2f} GB")

        self._disable_cache()

    def post_cache(self, suffix_tokens):
        """ warmup for next turn generation
        :param suffix_tokens:
        :return:
        """
        logger.info(f"cache size {self.llm.cache.cache_size}={self.llm.cache.cache_size / 1024 / 1024 / 1024:.2f} GB, "
                    f"process_mem: {psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024:.2f} GB")
        if suffix_tokens:
            logger.info(f"before warmup: n_tokens = {self.llm.n_tokens}")
            self.llm.eval([151645, 198] + suffix_tokens)  # <|im_end|>\n
            logger.info(f"after warmup: n_tokens = {self.llm.n_tokens}")
        logger.info(f"cache size {self.llm.cache.cache_size}={self.llm.cache.cache_size / 1024 / 1024 / 1024:.2f} GB, "
                    f"process_mem: {psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024:.2f} GB")

    def _disable_cache(self):
        # Freeze the cache built in pre_cache_system: monkeypatch cache writes and
        # state saving to no-ops so later generations reuse it but never grow it.
        llama_cpp.LlamaRAMCache.__setitem__ = lambda *args: None
        llama_cpp.Llama.save_state = lambda *args: None


if __name__ == "__main__":

    bot = Qwen2Simulator()
    messages = [{"role": "system", "content": "你是一个导游。"}]
    generated_tokens = None
    print("######## requesting", messages)
    for generated_text, generated_tokens in bot.generate(messages, stream=True):
        print(generated_text, generated_tokens)

    for i in range(3):
        generated_tokens = bot.strip_stoptokens(generated_tokens)
        messages.append(
            {"role": "user" if i % 2 == 0 else "assistant", "content": generated_text, "tokens": generated_tokens})
        print("######## requesting", messages)
        for generated_text, generated_tokens in bot.generate(messages, stream=True):
            pass
            # print(generated_text, generated_tokens)