BlinkDL committed on
Commit
296b533
1 Parent(s): f6b160e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -238
app.py CHANGED
@@ -1,189 +1,49 @@
1
- import os
2
- os.environ["RWKV_JIT_ON"] = '1'
3
- os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)
4
- # make sure cuda dir is in the same level as modeling_rwkv.py
5
- from modeling_rwkv import RWKV
6
-
7
- import gc
8
  import gradio as gr
9
- import base64
10
- from io import BytesIO
11
- import torch
12
- import torch.nn.functional as F
13
  from datetime import datetime
14
- from transformers import CLIPImageProcessor
15
  from huggingface_hub import hf_hub_download
16
  from pynvml import *
17
  nvmlInit()
18
  gpu_h = nvmlDeviceGetHandleByIndex(0)
19
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
20
-
21
- ctx_limit = 2500
22
- gen_limit = 500
23
- ########################## text rwkv ################################################################
24
- from rwkv.utils import PIPELINE, PIPELINE_ARGS
25
 
26
- title_v6 = "RWKV-x060-World-3B-v2.1-20240417-ctx4096"
27
- model_path_v6 = hf_hub_download(repo_id="BlinkDL/rwkv-6-world", filename=f"{title_v6}.pth")
28
- model_v6 = RWKV(model=model_path_v6, strategy='cuda fp16')
29
- pipeline_v6 = PIPELINE(model_v6, "rwkv_vocab_v20230424")
30
 
31
- title = "RWKV-5-World-1B5-v2-20231025-ctx4096"
32
- model_path = hf_hub_download(repo_id="BlinkDL/rwkv-5-world", filename=f"{title}.pth")
 
33
  model = RWKV(model=model_path, strategy='cuda fp16')
 
34
  pipeline = PIPELINE(model, "rwkv_vocab_v20230424")
35
 
36
- def generate_prompt(instruction, input=""):
37
- instruction = instruction.strip().replace('\r\n','\n').replace('\n\n','\n')
38
- input = input.strip().replace('\r\n','\n').replace('\n\n','\n')
39
- if input:
40
- return f"""Instruction: {instruction}
41
-
42
- Input: {input}
43
-
44
- Response:"""
45
- else:
46
- return f"""User: hi
47
-
48
- Assistant: Hi. I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.
49
-
50
- User: {instruction}
51
-
52
- Assistant:"""
53
 
54
  def evaluate(
55
  ctx,
56
- token_count=200,
57
  temperature=1.0,
58
- top_p=0.7,
59
- presencePenalty = 0.1,
60
- countPenalty = 0.1,
61
  ):
62
  args = PIPELINE_ARGS(temperature = max(0.2, float(temperature)), top_p = float(top_p),
63
  alpha_frequency = countPenalty,
64
  alpha_presence = presencePenalty,
65
  token_ban = [], # ban the generation of some tokens
66
  token_stop = [0]) # stop generation whenever you see any token here
67
- ctx = ctx.strip()
68
  all_tokens = []
69
  out_last = 0
70
  out_str = ''
71
  occurrence = {}
72
  state = None
73
  for i in range(int(token_count)):
74
- input_ids = pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token]
75
- out, state = model_v6.forward(tokens=input_ids, state=state)
76
- for n in occurrence:
77
- out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)
78
-
79
- token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
80
- if token in args.token_stop:
81
- break
82
- all_tokens += [token]
83
- for xxx in occurrence:
84
- occurrence[xxx] *= 0.994
85
-
86
- ttt = pipeline.decode([token])
87
- www = 1
88
- if ttt in ' \t0123456789':
89
- www = 0
90
- #elif ttt in '\r\n,.;?!"\':+-*/=#@$%^&_`~|<>\\()[]{},。;“”:?!()【】':
91
- # www = 0.5
92
- if token not in occurrence:
93
- occurrence[token] = www
94
- else:
95
- occurrence[token] += www
96
-
97
- tmp = pipeline.decode(all_tokens[out_last:])
98
- if '\ufffd' not in tmp:
99
- out_str += tmp
100
- yield out_str.strip()
101
- out_last = i + 1
102
-
103
- gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
104
- timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
105
- print(f'{timestamp} - vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}')
106
- del out
107
- del state
108
- gc.collect()
109
- torch.cuda.empty_cache()
110
- yield out_str.strip()
111
-
112
- examples = [
113
- ["Assistant: How can we craft an engaging story featuring vampires on Mars? Let's think step by step and provide an expert response.", 500, 1, 0.3, 0, 1],
114
- ["Assistant: How can we persuade Elon Musk to follow you on Twitter? Let's think step by step and provide an expert response.", 500, 1, 0.3, 0, 1],
115
- [generate_prompt("東京で訪れるべき素晴らしい場所とその紹介をいくつか挙げてください。"), 500, 1, 0.3, 0, 1],
116
- [generate_prompt("Write a story using the following information.", "A man named Alex chops a tree down."), 500, 1, 0.3, 0, 1],
117
- ["A few light taps upon the pane made her turn to the window. It had begun to snow again.", 500, 1, 0.3, 0, 1],
118
- ['''Edward: I am Edward Elric from Fullmetal Alchemist.
119
-
120
- User: Hello Edward. What have you been up to recently?
121
-
122
- Edward:''', 500, 1, 0.3, 0, 1],
123
- ['''Japanese: 春の初め、桜の花が満開になる頃、小さな町の片隅にある古びた神社の境内は、特別な雰囲気に包まれていた。
124
-
125
- English:''', 500, 1, 0.3, 0, 1],
126
- ["En una pequeña aldea escondida entre las montañas de Andalucía, donde las calles aún conservaban el eco de antiguas leyendas, vivía un joven llamado Alejandro.", 500, 1, 0.3, 0, 1],
127
- ["Dans le cœur battant de Paris, sous le ciel teinté d'un crépuscule d'or et de pourpre, se tenait une petite librairie oubliée par le temps.", 500, 1, 0.3, 0, 1],
128
- ["في تطور مذهل وغير مسبوق، أعلنت السلطات المحلية في العاصمة عن اكتشاف أثري قد يغير مجرى التاريخ كما نعرفه.", 500, 1, 0.3, 0, 1],
129
- ['''“当然可以,大宇宙不会因为这五公斤就不坍缩了。”关一帆说,他还有一个没说出来的想法:也许大宇宙真的会因为相差一个原子的质量而由封闭转为开放。大自然的精巧有时超出想象,比如生命的诞生,就需要各项宇宙参数在几亿亿分之一精度上的精确配合。但程心仍然可以留下她的生态球,因为在那无数文明创造的无数小宇宙中,肯定有相当一部分不响应回归运动的号召,所以,大宇宙最终被夺走的质量至少有几亿吨,甚至可能是几亿亿亿吨。
130
- 但愿大宇宙能够忽略这个误差。
131
- 程心和关一帆进入了飞船,智子最后也进来了。她早就不再穿那身华丽的和服了,她现在身着迷彩服,再次成为一名轻捷精悍的战士,她的身上佩带着许多武器和生存装备,最引人注目的是那把插在背后的武士刀。
132
- “放心,我在,你们就在!”智子对两位人类朋友说。
133
- 聚变发动机启动了,推进器发出幽幽的蓝光,''', 500, 1, 0.3, 0, 1],
134
- ]
135
-
136
- ########################## visual rwkv ################################################################
137
- visual_title = 'ViusualRWKV-v5'
138
- rwkv_remote_path = "rwkv1b5-vitl336p14-577token_mix665k_rwkv.pth"
139
- vision_remote_path = "rwkv1b5-vitl336p14-577token_mix665k_visual.pth"
140
- vision_tower_name = 'openai/clip-vit-large-patch14-336'
141
-
142
- model_path = hf_hub_download(repo_id="howard-hou/visualrwkv-5", filename=rwkv_remote_path)
143
- visual_rwkv = RWKV(model=model_path, strategy='cuda fp16')
144
-
145
- ##########################################################################
146
- from modeling_vision import VisionEncoder, VisionEncoderConfig
147
- config = VisionEncoderConfig(n_embd=model.args.n_embd,
148
- vision_tower_name=vision_tower_name,
149
- grid_size=-1)
150
- visual_encoder = VisionEncoder(config)
151
- vision_local_path = hf_hub_download(repo_id="howard-hou/visualrwkv-5", filename=vision_remote_path)
152
- vision_state_dict = torch.load(vision_local_path, map_location='cpu')
153
- visual_encoder.load_state_dict(vision_state_dict)
154
- image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
155
- visual_encoder = visual_encoder.to(device)
156
- ##########################################################################
157
- def visual_generate_prompt(instruction):
158
- instruction = instruction.strip().replace('\r\n','\n').replace('\n\n','\n')
159
- return f"\n{instruction}\n\nAssistant:"
160
-
161
- def generate(
162
- ctx,
163
- image_state,
164
- token_count=200,
165
- temperature=1.0,
166
- top_p=0.1,
167
- presencePenalty = 0.0,
168
- countPenalty = 1.0,
169
- ):
170
- args = PIPELINE_ARGS(temperature = 1.0, top_p = 0.1,
171
- alpha_frequency = 1.0,
172
- alpha_presence = 0.0,
173
- token_ban = [], # ban the generation of some tokens
174
- token_stop = [0, 261]) # stop generation whenever you see any token here
175
- ctx = ctx.strip()
176
- all_tokens = []
177
- out_last = 0
178
- out_str = ''
179
- occurrence = {}
180
- for i in range(int(token_count)):
181
- if i == 0:
182
- input_ids = pipeline.encode(ctx)[-ctx_limit:]
183
- out, state = visual_rwkv.forward(tokens=input_ids, state=image_state)
184
- else:
185
- input_ids = [token]
186
- out, state = visual_rwkv.forward(tokens=input_ids, state=state)
187
  for n in occurrence:
188
  out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)
189
 
@@ -192,7 +52,7 @@ def generate(
192
  break
193
  all_tokens += [token]
194
  for xxx in occurrence:
195
- occurrence[xxx] *= 0.994
196
  if token not in occurrence:
197
  occurrence[token] = 1
198
  else:
@@ -206,77 +66,31 @@ def generate(
206
 
207
  gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
208
  timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
209
- print(f'{timestamp} - vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}')
210
  del out
211
  del state
212
  gc.collect()
213
  torch.cuda.empty_cache()
214
  yield out_str.strip()
215
 
216
-
217
- ##########################################################################
218
- cur_dir = os.path.dirname(os.path.abspath(__file__))
219
- visual_examples = [
220
- [
221
- f"{cur_dir}/examples_pizza.jpg",
222
- "What are steps to cook it?"
223
- ],
224
- [
225
- f"{cur_dir}/examples_bluejay.jpg",
226
- "what is the name of this bird?",
227
- ],
228
- [
229
- f"{cur_dir}/examples_woman_and_dog.png",
230
- "describe this image",
231
- ],
232
  ]
233
 
 
234
 
235
- def pil_image_to_base64(pil_image):
236
- buffered = BytesIO()
237
- pil_image.save(buffered, format="JPEG") # You can change the format as needed (JPEG, PNG, etc.)
238
- # Encodes the image data into base64 format as a bytes object
239
- base64_image = base64.b64encode(buffered.getvalue()).decode('utf-8')
240
- return base64_image
241
-
242
- image_cache = {}
243
- ln0_weight = model.w['blocks.0.ln0.weight'].to(torch.float32).to(device)
244
- ln0_bias = model.w['blocks.0.ln0.bias'].to(torch.float32).to(device)
245
- def compute_image_state(image):
246
- base64_image = pil_image_to_base64(image)
247
- if base64_image in image_cache:
248
- image_state = image_cache[base64_image]
249
- else:
250
- image = image_processor(images=image.convert('RGB'), return_tensors='pt')['pixel_values'].to(device)
251
- image_features = visual_encoder.encode_images(image.unsqueeze(0)).squeeze(0) # [L, D]
252
- # apply layer norm to image feature, very important
253
- image_features = F.layer_norm(image_features,
254
- (image_features.shape[-1],),
255
- weight=ln0_weight,
256
- bias=ln0_bias)
257
- _, image_state = model.forward(embs=image_features, state=None)
258
- image_cache[base64_image] = image_state
259
- return image_state
260
-
261
- def chatbot(image, question):
262
- if image is None:
263
- yield "Please upload an image."
264
- return
265
- image_state = compute_image_state(image)
266
- input_text = visual_generate_prompt(question)
267
- for output in generate(input_text, image_state):
268
- yield output
269
-
270
-
271
- ##################################################################################################################
272
  with gr.Blocks(title=title) as demo:
273
- gr.HTML(f"<div style=\"text-align: center;\">\n<h1>{title_v6}</h1>\n</div>")
274
  with gr.Tab("Raw Generation"):
275
- gr.Markdown(f"This is [RWKV-6 World v2](https://huggingface.co/BlinkDL/rwkv-6-world) - a 100% attention-free RNN [RWKV-LM](https://github.com/BlinkDL/RWKV-LM). Supports 100+ world languages and code. And we have [300+ Github RWKV projects](https://github.com/search?o=desc&p=1&q=rwkv&s=updated&type=Repositories). *** Please try examples first (bottom of page) *** (edit them to use your question). Demo limited to ctxlen {ctx_limit}. (VisualRWKV is using RWKV5 1.5B)")
276
  with gr.Row():
277
  with gr.Column():
278
- prompt = gr.Textbox(lines=2, label="Prompt", value="Assistant: How can we craft an engaging story featuring vampires on Mars? Let's think step by step and provide an expert response.")
279
- token_count = gr.Slider(10, gen_limit, label="Max Tokens", step=10, value=gen_limit)
280
  temperature = gr.Slider(0.2, 2.0, label="Temperature", step=0.1, value=1.0)
281
  top_p = gr.Slider(0.0, 1.0, label="Top P", step=0.05, value=0.3)
282
  presence_penalty = gr.Slider(0.0, 1.0, label="Presence Penalty", step=0.1, value=0)
@@ -285,27 +99,11 @@ with gr.Blocks(title=title) as demo:
285
  with gr.Row():
286
  submit = gr.Button("Submit", variant="primary")
287
  clear = gr.Button("Clear", variant="secondary")
288
- output = gr.Textbox(label="Output", lines=30)
289
  data = gr.Dataset(components=[prompt, token_count, temperature, top_p, presence_penalty, count_penalty], samples=examples, samples_per_page=50, label="Example Instructions", headers=["Prompt", "Max Tokens", "Temperature", "Top P", "Presence Penalty", "Count Penalty"])
290
- submit.click(evaluate, [prompt, token_count, temperature, top_p, presence_penalty, count_penalty], [output])
291
  clear.click(lambda: None, [], [output])
292
  data.click(lambda x: x, [data], [prompt, token_count, temperature, top_p, presence_penalty, count_penalty])
293
- with gr.Tab("Visual RWKV"):
294
- with gr.Row():
295
- with gr.Column():
296
- image = gr.Image(type='pil', label="Image")
297
- with gr.Column():
298
- prompt = gr.Textbox(lines=8, label="Prompt",
299
- value="Render a clear and concise summary of the photo.")
300
- with gr.Row():
301
- submit = gr.Button("Submit", variant="primary")
302
- clear = gr.Button("Clear", variant="secondary")
303
- with gr.Column():
304
- output = gr.Textbox(label="Output", lines=10)
305
- data = gr.Dataset(components=[image, prompt], samples=visual_examples, label="Examples", headers=["Image", "Prompt"])
306
- submit.click(chatbot, [image, prompt], [output])
307
- clear.click(lambda: None, [], [output])
308
- data.click(lambda x: x, [data], [image, prompt])
309
 
310
- demo.queue(concurrency_count=1, max_size=10)
311
- demo.launch(share=False)
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import os, gc, copy, torch, re
 
 
 
3
  from datetime import datetime
 
4
  from huggingface_hub import hf_hub_download
5
  from pynvml import *
6
  nvmlInit()
7
  gpu_h = nvmlDeviceGetHandleByIndex(0)
8
+ ctx_limit = 1024
9
+ title = "RWKV-x060-eng_single_round_test-1B6-20240427-ctx1024"
 
 
 
 
10
 
11
+ os.environ["RWKV_JIT_ON"] = '1'
12
+ os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)
 
 
13
 
14
+ from rwkv.model import RWKV
15
+ model_path = hf_hub_download(repo_id="BlinkDL/temp-latest-training-models", filename=f"{title}.pth")
16
+ # model_path = f"E:/{title}"
17
  model = RWKV(model=model_path, strategy='cuda fp16')
18
+ from rwkv.utils import PIPELINE, PIPELINE_ARGS
19
  pipeline = PIPELINE(model, "rwkv_vocab_v20230424")
20
 
21
+ def generate_prompt(instruction):
22
+ instruction = instruction.strip().replace('\r\n','\n')
23
+ instruction = re.sub(r'\n+', '\n', instruction)
24
+ return f"User: {instruction}\n\nAssistant:"""
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  def evaluate(
27
  ctx,
28
+ token_count=500,
29
  temperature=1.0,
30
+ top_p=0.3,
31
+ presencePenalty = 0.3,
32
+ countPenalty = 0.3,
33
  ):
34
  args = PIPELINE_ARGS(temperature = max(0.2, float(temperature)), top_p = float(top_p),
35
  alpha_frequency = countPenalty,
36
  alpha_presence = presencePenalty,
37
  token_ban = [], # ban the generation of some tokens
38
  token_stop = [0]) # stop generation whenever you see any token here
39
+ ctx = generate_prompt(ctx)
40
  all_tokens = []
41
  out_last = 0
42
  out_str = ''
43
  occurrence = {}
44
  state = None
45
  for i in range(int(token_count)):
46
+ out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  for n in occurrence:
48
  out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)
49
 
 
52
  break
53
  all_tokens += [token]
54
  for xxx in occurrence:
55
+ occurrence[xxx] *= 0.996
56
  if token not in occurrence:
57
  occurrence[token] = 1
58
  else:
 
66
 
67
  gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
68
  timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
69
+ print(f'{timestamp} - vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}')
70
  del out
71
  del state
72
  gc.collect()
73
  torch.cuda.empty_cache()
74
  yield out_str.strip()
75
 
76
+ examples = [
77
+ ["How can I craft an engaging story featuring vampires on Mars?", 700, 1, 0.3, 0.3, 0.3],
78
+ ["Write a simple website in HTML. When a user clicks the button, it shows a random joke from a list of 4 jokes.", 700, 1, 0.3, 0.3, 0.3],
79
+ ["Write C++ code to land on moon.", 700, 1, 0.3, 0.3, 0.3],
80
+ ["Write a story using the following information: a man named Alex chops a tree down.", 700, 1, 0.3, 0.3, 0.3],
81
+ ["How can I persuade Elon Musk to follow me on Twitter?", 700, 1, 0.3, 0.3, 0.3],
 
 
 
 
 
 
 
 
 
 
82
  ]
83
 
84
+ ##########################################################################
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  with gr.Blocks(title=title) as demo:
87
+ gr.HTML(f"<div style=\"text-align: center;\">\n<h1>{title}</h1>\n</div>")
88
  with gr.Tab("Raw Generation"):
89
+ gr.Markdown(f"This is [RWKV-6](https://huggingface.co/BlinkDL/temp-latest-training-models) with 1.6B params tuned on <b>single-round English</b> Q & A - a 100% attention-free RNN [RWKV-LM](https://github.com/BlinkDL/RWKV-LM). And we have [200+ Github RWKV projects](https://github.com/search?o=desc&p=1&q=rwkv&s=updated&type=Repositories). Demo limited to ctxlen {ctx_limit}.")
90
  with gr.Row():
91
  with gr.Column():
92
+ prompt = gr.Textbox(lines=2, label="Prompt", value="How can we craft an engaging story featuring vampires on Mars?")
93
+ token_count = gr.Slider(10, 700, label="Max Tokens", step=10, value=700)
94
  temperature = gr.Slider(0.2, 2.0, label="Temperature", step=0.1, value=1.0)
95
  top_p = gr.Slider(0.0, 1.0, label="Top P", step=0.05, value=0.3)
96
  presence_penalty = gr.Slider(0.0, 1.0, label="Presence Penalty", step=0.1, value=0)
 
99
  with gr.Row():
100
  submit = gr.Button("Submit", variant="primary")
101
  clear = gr.Button("Clear", variant="secondary")
102
+ output = gr.Textbox(label="Output", lines=50)
103
  data = gr.Dataset(components=[prompt, token_count, temperature, top_p, presence_penalty, count_penalty], samples=examples, samples_per_page=50, label="Example Instructions", headers=["Prompt", "Max Tokens", "Temperature", "Top P", "Presence Penalty", "Count Penalty"])
104
+ submit.click(evaluate, [prompt, token_count, temperature, top_p, presence_penalty, count_penalty], [output], concurrency_limit=1)
105
  clear.click(lambda: None, [], [output])
106
  data.click(lambda x: x, [data], [prompt, token_count, temperature, top_p, presence_penalty, count_penalty])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
+ demo.queue(max_size=10)
109
+ demo.launch(share=False)