erichilarysmithsr and MAGAer13 committed on
Commit
b537101
0 Parent(s):

Duplicate from MAGAer13/mPLUG-Owl


Co-authored-by: QinghaoYe <MAGAer13@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ .pt filter=lfs diff=lfs merge=lfs -text
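All of the patterns above route large binaries through Git LFS. As a rough illustration (not part of the commit), the sketch below checks which files added in this commit fall under a few of those patterns; `fnmatch` only approximates gitattributes glob semantics (it ignores directory rules like `saved_model/**/*`, for example).

```python
import fnmatch

# Rough illustration: which files from this commit match the LFS patterns above.
# fnmatch only approximates gitattributes globbing (directory rules are ignored).
lfs_patterns = ["*.bin", "*.model", "*.ckpt", "*.pt", "*.pth", "*.safetensors"]
for name in ["pytorch_model.bin", "tokenizer.model", "app.py", "config.json"]:
    tracked = any(fnmatch.fnmatch(name, pattern) for pattern in lfs_patterns)
    print(f"{name}: {'LFS pointer' if tracked else 'stored directly in git'}")
```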
README.md ADDED
@@ -0,0 +1,15 @@
+ ---
+ title: MPLUG Owl
+ emoji: 🦉
+ colorFrom: gray
+ colorTo: indigo
+ sdk: gradio
+ sdk_version: 3.28.1
+ app_file: app.py
+ pinned: false
+ arxiv: 2304.14178
+ license: apache-2.0
+ duplicated_from: MAGAer13/mPLUG-Owl
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,419 @@
+ import os
+ import wget
+ resources = os.getenv('resources_new')
+ resources_filename = wget.download(resources)
+
+ os.system('tar zxvf {}'.format(resources_filename))
+ os.system('ls -l')
+
+ import argparse
+ import datetime
+ import json
+ import os
+ import time
+ import torch
+
+ import gradio as gr
+ import requests
+
+ from conversation import default_conversation
+ from gradio_css import code_highlight_css
+ from gradio_patch import Chatbot as grChatbot
+ from serve_utils import (
+     add_text, after_process_image, disable_btn, no_change_btn,
+     downvote_last_response, enable_btn, flag_last_response,
+     get_window_url_params, init, regenerate, upvote_last_response,
+     after_process_video
+ )
+ from model_worker import mPLUG_Owl_Server
+ from model_utils import post_process_code
+
+ SHARED_UI_WARNING = f'''### [NOTE] You can duplicate this Space and use it with a paid private GPU.
+ <a class="duplicate-button" style="display:inline-block" target="_blank" href="https://huggingface.co/spaces/MAGAer13/mPLUG-Owl?duplicate=true"><img style="margin-top:0;margin-bottom:0" src="https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-md.svg" alt="Duplicate Space"></a>
+ '''
+
+
+ def load_demo(url_params, request: gr.Request):
+
+     dropdown_update = gr.Dropdown.update(visible=True)
+     state = default_conversation.copy()
+
+     return (state,
+             dropdown_update,
+             gr.Chatbot.update(visible=True),
+             gr.Textbox.update(visible=True),
+             gr.Button.update(visible=True),
+             gr.Row.update(visible=True),
+             gr.Accordion.update(visible=True))
+
+ def clear_history(request: gr.Request):
+     state = default_conversation.copy()
+
+     return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
+
+ def http_bot(state, max_output_tokens, temperature, top_k, top_p,
+              num_beams, no_repeat_ngram_size, length_penalty,
+              do_sample, request: gr.Request):
+     if state.skip_next:
+         # This generate call is skipped due to invalid inputs
+         yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
+         return
+
+     prompt = after_process_image(state.get_prompt())
+     images = state.get_images()
+
+     data = {
+         "text_input": prompt,
+         "images": images if len(images) > 0 else [],
+         "generation_config": {
+             "top_k": int(top_k),
+             "top_p": float(top_p),
+             "num_beams": int(num_beams),
+             "no_repeat_ngram_size": int(no_repeat_ngram_size),
+             "length_penalty": float(length_penalty),
+             "do_sample": bool(do_sample),
+             "temperature": float(temperature),
+             "max_new_tokens": min(int(max_output_tokens), 1536),
+         }
+     }
+
+     state.messages[-1][-1] = "▌"
+     yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
+
+     try:
+         for chunk in model.predict(data):
+             if chunk:
+                 if chunk[1]:
+                     output = chunk[0].strip()
+                     output = post_process_code(output)
+                     state.messages[-1][-1] = output + "▌"
+                     yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
+                 else:
+                     output = chunk[0].strip()
+                     state.messages[-1][-1] = output
+                     yield (state, state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
+                     return
+             time.sleep(0.03)
+
+     except requests.exceptions.RequestException as e:
+         state.messages[-1][-1] = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
+         yield (state, state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
+         return
+
+     state.messages[-1][-1] = state.messages[-1][-1][:-1]
+     yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5
+
+
+ def add_text_http_bot(
+     state, text, image, video, num_frames,
+     max_output_tokens, temperature, top_k, top_p,
+     num_beams, no_repeat_ngram_size, length_penalty,
+     do_sample, request: gr.Request):
+     if len(text) <= 0 and image is None and video is None:
+         state.skip_next = True
+         return (state, state.to_gradio_chatbot(), "", None, None) + (no_change_btn,) * 5
+
+     if image is not None:
+         if '<image>' not in text:
+             text = text + '\n<image>'
+         text = (text, image)
+
+     if video is not None:
+         if '<|video|>' not in text:
+             text = text + '\n<|video|>'
+         text = (text, video)
+
+     state.append_message(state.roles[0], text)
+     state.append_message(state.roles[1], None)
+     state.skip_next = False
+
+     yield (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
+
+     if state.skip_next:
+         # This generate call is skipped due to invalid inputs
+         yield (state, state.to_gradio_chatbot(), "", None, None) + (no_change_btn,) * 5
+         return
+
+     prompt = state.get_prompt(num_frames)
+     prompt = after_process_image(prompt)
+     prompt = after_process_video(prompt)
+     prompt = prompt.replace("Human: \n", "")
+
+     images = state.get_images()
+     videos = state.get_videos(num_frames)
+
+     data = {
+         "text_input": prompt,
+         "images": images if len(images) > 0 else [],
+         "videos": videos if len(videos) > 0 else [],
+         "video": video if video is not None else None,
+         "generation_config": {
+             "top_k": int(top_k),
+             "top_p": float(top_p),
+             "num_beams": int(num_beams),
+             "no_repeat_ngram_size": int(no_repeat_ngram_size),
+             "length_penalty": float(length_penalty),
+             "do_sample": bool(do_sample),
+             "temperature": float(temperature),
+             "max_new_tokens": min(int(max_output_tokens), 1536),
+         }
+     }
+
+     state.messages[-1][-1] = "▌"
+     yield (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
+
+     try:
+         for chunk in model.predict(data):
+             if chunk:
+                 if chunk[1]:
+                     output = chunk[0].strip()
+                     output = post_process_code(output)
+                     state.messages[-1][-1] = output + "▌"
+                     yield (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
+                 else:
+                     output = chunk[0].strip()
+                     state.messages[-1][-1] = output
+                     yield (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
+                     return
+             time.sleep(0.03)
+
+     except requests.exceptions.RequestException as e:
+         state.messages[-1][-1] = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
+         yield (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
+         return
+
+     state.messages[-1][-1] = state.messages[-1][-1][:-1]
+     yield (state, state.to_gradio_chatbot(), "", None, None) + (enable_btn,) * 5
+
+
+ def regenerate_http_bot(state, num_frames,
+                         max_output_tokens, temperature, top_k, top_p,
+                         num_beams, no_repeat_ngram_size, length_penalty,
+                         do_sample, request: gr.Request):
+     state.messages[-1][-1] = None
+     state.skip_next = False
+     yield (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
+
+     prompt = after_process_image(state.get_prompt(num_frames))
+     images = state.get_images()
+     videos = state.get_videos(num_frames)
+
+     data = {
+         "text_input": prompt,
+         "images": images if len(images) > 0 else [],
+         "videos": videos if len(videos) > 0 else [],
+         "generation_config": {
+             "top_k": int(top_k),
+             "top_p": float(top_p),
+             "num_beams": int(num_beams),
+             "no_repeat_ngram_size": int(no_repeat_ngram_size),
+             "length_penalty": float(length_penalty),
+             "do_sample": bool(do_sample),
+             "temperature": float(temperature),
+             "max_new_tokens": min(int(max_output_tokens), 1536),
+         }
+     }
+
+     state.messages[-1][-1] = "▌"
+     yield (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
+
+     try:
+         for chunk in model.predict(data):
+             if chunk:
+                 if chunk[1]:
+                     output = chunk[0].strip()
+                     output = post_process_code(output)
+                     state.messages[-1][-1] = output + "▌"
+                     yield (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
+                 else:
+                     output = chunk[0].strip()
+                     state.messages[-1][-1] = output
+                     yield (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
+                     return
+             time.sleep(0.03)
+
+     except requests.exceptions.RequestException as e:
+         state.messages[-1][-1] = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
+         yield (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
+         return
+
+     state.messages[-1][-1] = state.messages[-1][-1][:-1]
+     yield (state, state.to_gradio_chatbot(), "", None, None) + (enable_btn,) * 5
+
+ # [![Star on GitHub](https://img.shields.io/github/stars/X-PLUG/mPLUG-Owl.svg?style=social)](https://github.com/X-PLUG/mPLUG-Owl/stargazers)
+ # **If you are facing an ERROR, it is likely an Out-Of-Memory (OOM) issue caused by the limited GPU memory; please refresh the page to restart.** Besides, we recommend duplicating the Space with a single A10 GPU for a better experience. Alternatively, you can visit our demo hosted on [Modelscope](https://www.modelscope.cn/studios/damo/mPLUG-Owl/summary), which runs on a V100 machine.
+
+ title_markdown = ("""
+ <h1 align="center"><a href="https://github.com/X-PLUG/mPLUG-Owl"><img src="https://s1.ax1x.com/2023/05/12/p9yGA0g.png" alt="mPLUG-Owl" border="0" style="margin: 0 auto; height: 200px;" /></a> </h1>
+
+ <h2 align="center"> mPLUG-Owl🦉: Modularization Empowers Large Language Models with Multimodality </h2>
+
+ <h5 align="center"> If you like our project, please give us a star ✨ on GitHub for the latest updates. </h5>
+
+ <div align="center">
+ <div style="display:flex; gap: 0.25rem;" align="center">
+ <a href='https://github.com/X-PLUG/mPLUG-Owl'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
+ <a href="https://arxiv.org/abs/2304.14178"><img src="https://img.shields.io/badge/Arxiv-2304.14178-red"></a>
+ <a href='https://github.com/X-PLUG/mPLUG-Owl/stargazers'><img src='https://img.shields.io/github/stars/X-PLUG/mPLUG-Owl.svg?style=social'></a>
+ </div>
+ </div>
+
+ **Notice**: The output is generated by a top-k sampling scheme and may involve some randomness. For multiple images or videos, we cannot guarantee performance, since only image-text / video-text pairs were used during training.
+
+ **We recommend only one image or video per conversation session.** If you want to start chatting with new images or videos, we recommend you **CLEAR** the history to restart.
+
+ """)
+
+ tos_markdown = ("""
+ ### Terms of use
+ By using this service, users are required to agree to the following terms:
+ The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
+ Please click the "Flag" button if you get any inappropriate answer! We will collect those to keep improving our moderation.
+ For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
+
+ **Copyright 2023 Alibaba DAMO Academy.**
+ """)
+
+ learn_more_markdown = ("""
+ ### License
+ The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA, [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation.
+ """)
+
+ css = code_highlight_css + """
+ pre {
+     white-space: pre-wrap;       /* Since CSS 2.1 */
+     white-space: -moz-pre-wrap;  /* Mozilla, since 1999 */
+     white-space: -pre-wrap;      /* Opera 4-6 */
+     white-space: -o-pre-wrap;    /* Opera 7 */
+     word-wrap: break-word;       /* Internet Explorer 5.5+ */
+ }
+ """
+
+ def build_demo():
+     # with gr.Blocks(title="mPLUG-Owl🦉", theme=gr.themes.Base(), css=css) as demo:
+     with gr.Blocks(title="mPLUG-Owl🦉", css=css) as demo:
+         state = gr.State()
+         gr.Markdown(SHARED_UI_WARNING)
+
+         gr.Markdown(title_markdown)
+
+         with gr.Row():
+             with gr.Column(scale=3):
+
+                 imagebox = gr.Image(type="pil")
+                 videobox = gr.Video()
+
+                 with gr.Accordion("Parameters", open=True, visible=False) as parameter_row:
+                     max_output_tokens = gr.Slider(minimum=0, maximum=1024, value=512, step=64, interactive=True, label="Max output tokens",)
+                     temperature = gr.Slider(minimum=0, maximum=1, value=1, step=0.1, interactive=True, label="Temperature",)
+                     top_k = gr.Slider(minimum=1, maximum=5, value=3, step=1, interactive=True, label="Top K",)
+                     top_p = gr.Slider(minimum=0, maximum=1, value=0.9, step=0.1, interactive=True, label="Top p",)
+                     length_penalty = gr.Slider(minimum=1, maximum=5, value=1, step=0.1, interactive=True, label="length_penalty",)
+                     num_beams = gr.Slider(minimum=1, maximum=5, value=1, step=1, interactive=True, label="Beam Size",)
+                     no_repeat_ngram_size = gr.Slider(minimum=1, maximum=5, value=2, step=1, interactive=True, label="no_repeat_ngram_size",)
+                     num_frames = gr.Slider(minimum=8, maximum=32, value=8, step=4, interactive=True, label="Number of Frames",)
+                     do_sample = gr.Checkbox(interactive=True, value=True, label="do_sample")
+
+                 gr.Markdown(tos_markdown)
+
+             with gr.Column(scale=6):
+                 chatbot = grChatbot(elem_id="chatbot", visible=False).style(height=1000)
+                 with gr.Row():
+                     with gr.Column(scale=8):
+                         textbox = gr.Textbox(show_label=False,
+                             placeholder="Enter text and press ENTER", visible=False).style(container=False)
+                     with gr.Column(scale=1, min_width=60):
+                         submit_btn = gr.Button(value="Submit", visible=False)
+                 with gr.Row(visible=False) as button_row:
+                     upvote_btn = gr.Button(value="👍 Upvote", interactive=False)
+                     downvote_btn = gr.Button(value="👎 Downvote", interactive=False)
+                     flag_btn = gr.Button(value="⚠️ Flag", interactive=False)
+                     regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False)
+                     clear_btn = gr.Button(value="🗑️ Clear history", interactive=False)
+
+         gr.Examples(examples=[
+             [f"examples/monday.jpg", "Explain why this meme is funny."],
+             [f'examples/rap.jpeg', 'Can you write me a master rap song that rhymes very well based on this image?'],
+             [f'examples/titanic.jpeg', 'What happened at the end of this movie?'],
+             [f'examples/vga.jpeg', 'What is funny about this image? Describe it panel by panel.'],
+             [f'examples/mug_ad.jpeg', 'We design new mugs shown in the image. Can you help us write an advertisement?'],
+             [f'examples/laundry.jpeg', 'Why does this happen and how can it be fixed?'],
+             [f'examples/ca.jpeg', "What do you think about the person's behavior?"],
+             [f'examples/monalisa-fun.jpg', 'Do you know who drew this painting?'],
+         ], inputs=[imagebox, textbox])
+
+         gr.Markdown(learn_more_markdown)
+         url_params = gr.JSON(visible=False)
+
+         btn_list = [upvote_btn, downvote_btn, flag_btn, regenerate_btn, clear_btn]
+         parameter_list = [
+             num_frames, max_output_tokens, temperature, top_k, top_p,
+             num_beams, no_repeat_ngram_size, length_penalty,
+             do_sample
+         ]
+         upvote_btn.click(upvote_last_response,
+             [state], [textbox, upvote_btn, downvote_btn, flag_btn])
+         downvote_btn.click(downvote_last_response,
+             [state], [textbox, upvote_btn, downvote_btn, flag_btn])
+         flag_btn.click(flag_last_response,
+             [state], [textbox, upvote_btn, downvote_btn, flag_btn])
+         # regenerate_btn.click(regenerate, state,
+         #     [state, chatbot, textbox, imagebox, videobox] + btn_list).then(
+         #     http_bot, [state] + parameter_list,
+         #     [state, chatbot] + btn_list)
+         regenerate_btn.click(regenerate_http_bot, [state] + parameter_list,
+             [state, chatbot, textbox, imagebox, videobox] + btn_list)
+
+         clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox, videobox] + btn_list)
+
+         # textbox.submit(add_text, [state, textbox, imagebox, videobox], [state, chatbot, textbox, imagebox, videobox] + btn_list
+         #     ).then(http_bot, [state] + parameter_list,
+         #     [state, chatbot] + btn_list)
+         # submit_btn.click(add_text, [state, textbox, imagebox, videobox], [state, chatbot, textbox, imagebox, videobox] + btn_list
+         #     ).then(http_bot, [state] + parameter_list,
+         #     [state, chatbot] + btn_list)
+
+         textbox.submit(add_text_http_bot,
+             [state, textbox, imagebox, videobox] + parameter_list,
+             [state, chatbot, textbox, imagebox, videobox] + btn_list
+         )
+
+         submit_btn.click(add_text_http_bot,
+             [state, textbox, imagebox, videobox] + parameter_list,
+             [state, chatbot, textbox, imagebox, videobox] + btn_list
+         )
+
+         demo.load(load_demo, [url_params], [state,
+             chatbot, textbox, submit_btn, button_row, parameter_row],
+             _js=get_window_url_params)
+
+     return demo
+
+ if __name__ == "__main__":
+     io = init()
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--host", type=str, default="0.0.0.0")
+     parser.add_argument("--debug", action="store_true", help="using debug mode")
+     parser.add_argument("--port", type=int)
+     parser.add_argument("--concurrency-count", type=int, default=1)
+     parser.add_argument("--base-model", type=str, default='./')
+     parser.add_argument("--load-8bit", action="store_true", help="using 8bit mode")
+     parser.add_argument("--bf16", action="store_true", default=True, help="using bf16 mode")
+     args = parser.parse_args()
+
+     if torch.cuda.is_available():
+         device = "cuda"
+     else:
+         device = "cpu"
+
+     model = mPLUG_Owl_Server(
+         base_model=args.base_model,
+         load_in_8bit=args.load_8bit,
+         bf16=args.bf16,
+         device=device,
+         io=io
+     )
+     demo = build_demo()
+     demo.queue(concurrency_count=args.concurrency_count, status_update_rate=10, api_open=False).launch(server_name=args.host, debug=args.debug, server_port=args.port, share=False)
+
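All three handlers above assemble the same request dictionary before calling `model.predict`. A minimal sketch of that payload with placeholder values (the real prompt template and image encoding live in `conversation.py` and `serve_utils.py`, which are not part of this commit, so the exact format shown here is an assumption):

```python
# Sketch of the payload the handlers above pass to mPLUG_Owl_Server.predict().
# Field names mirror the `data` dicts built in http_bot / add_text_http_bot;
# the values are placeholders and the prompt format is an assumption.
example_payload = {
    "text_input": "Human: <image>\nExplain why this meme is funny.\nAI:",
    "images": [],  # filled from state.get_images(); encoding is defined in conversation.py
    "generation_config": {
        "top_k": 3,
        "top_p": 0.9,
        "num_beams": 1,
        "no_repeat_ngram_size": 2,
        "length_penalty": 1.0,
        "do_sample": True,
        "temperature": 1.0,
        "max_new_tokens": 512,  # the handlers cap this at 1536
    },
}
# predict() is consumed as a generator of (partial_text, still_streaming) chunks,
# which is how the streaming loops above read chunk[0] and chunk[1].
```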
config.json ADDED
@@ -0,0 +1,299 @@
+ {
+   "return_dict": true,
+   "output_hidden_states": false,
+   "output_attentions": false,
+   "torchscript": false,
+   "torch_dtype": null,
+   "use_bfloat16": false,
+   "tf_legacy_loss": false,
+   "pruned_heads": {},
+   "tie_word_embeddings": false,
+   "is_encoder_decoder": false,
+   "is_decoder": false,
+   "cross_attention_hidden_size": null,
+   "add_cross_attention": false,
+   "tie_encoder_decoder": false,
+   "max_length": 20,
+   "min_length": 0,
+   "do_sample": false,
+   "early_stopping": false,
+   "num_beams": 1,
+   "num_beam_groups": 1,
+   "diversity_penalty": 0.0,
+   "temperature": 1.0,
+   "top_k": 50,
+   "top_p": 1.0,
+   "typical_p": 1.0,
+   "repetition_penalty": 1.0,
+   "length_penalty": 1.0,
+   "no_repeat_ngram_size": 0,
+   "encoder_no_repeat_ngram_size": 0,
+   "bad_words_ids": null,
+   "num_return_sequences": 1,
+   "chunk_size_feed_forward": 0,
+   "output_scores": false,
+   "return_dict_in_generate": false,
+   "forced_bos_token_id": null,
+   "forced_eos_token_id": null,
+   "remove_invalid_values": false,
+   "exponential_decay_length_penalty": null,
+   "suppress_tokens": null,
+   "begin_suppress_tokens": null,
+   "architectures": null,
+   "finetuning_task": null,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1"
+   },
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1
+   },
+   "tokenizer_class": null,
+   "prefix": null,
+   "bos_token_id": null,
+   "pad_token_id": null,
+   "eos_token_id": null,
+   "sep_token_id": null,
+   "decoder_start_token_id": null,
+   "task_specific_params": null,
+   "problem_type": null,
+   "_name_or_path": "",
+   "_commit_hash": null,
+   "transformers_version": null,
+   "vision_config": {
+     "return_dict": true,
+     "output_hidden_states": false,
+     "output_attentions": false,
+     "torchscript": false,
+     "torch_dtype": null,
+     "use_bfloat16": false,
+     "tf_legacy_loss": false,
+     "pruned_heads": {},
+     "tie_word_embeddings": true,
+     "is_encoder_decoder": false,
+     "is_decoder": false,
+     "cross_attention_hidden_size": null,
+     "add_cross_attention": false,
+     "tie_encoder_decoder": false,
+     "max_length": 20,
+     "min_length": 0,
+     "do_sample": false,
+     "early_stopping": false,
+     "num_beams": 1,
+     "num_beam_groups": 1,
+     "diversity_penalty": 0.0,
+     "temperature": 1.0,
+     "top_k": 50,
+     "top_p": 1.0,
+     "typical_p": 1.0,
+     "repetition_penalty": 1.0,
+     "length_penalty": 1.0,
+     "no_repeat_ngram_size": 0,
+     "encoder_no_repeat_ngram_size": 0,
+     "bad_words_ids": null,
+     "num_return_sequences": 1,
+     "chunk_size_feed_forward": 0,
+     "output_scores": false,
+     "return_dict_in_generate": false,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "remove_invalid_values": false,
+     "exponential_decay_length_penalty": null,
+     "suppress_tokens": null,
+     "begin_suppress_tokens": null,
+     "architectures": null,
+     "finetuning_task": null,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "tokenizer_class": null,
+     "prefix": null,
+     "bos_token_id": null,
+     "pad_token_id": null,
+     "eos_token_id": null,
+     "sep_token_id": null,
+     "decoder_start_token_id": null,
+     "task_specific_params": null,
+     "problem_type": null,
+     "_name_or_path": "",
+     "transformers_version": "4.29.0.dev0",
+     "model_type": "mplug_owl_vision_model",
+     "hidden_size": 1024,
+     "intermediate_size": 4096,
+     "projection_dim": 768,
+     "num_hidden_layers": 24,
+     "num_attention_heads": 16,
+     "num_channels": 3,
+     "patch_size": 14,
+     "image_size": 224,
+     "initializer_range": 0.02,
+     "initializer_factor": 1.0,
+     "attention_dropout": 0.0,
+     "layer_norm_eps": 1e-06,
+     "hidden_act": "quick_gelu"
+   },
+   "visual_abstractor_config": {
+     "return_dict": true,
+     "output_hidden_states": false,
+     "output_attentions": false,
+     "torchscript": false,
+     "torch_dtype": null,
+     "use_bfloat16": false,
+     "tf_legacy_loss": false,
+     "pruned_heads": {},
+     "tie_word_embeddings": true,
+     "is_encoder_decoder": false,
+     "is_decoder": false,
+     "cross_attention_hidden_size": null,
+     "add_cross_attention": false,
+     "tie_encoder_decoder": false,
+     "max_length": 20,
+     "min_length": 0,
+     "do_sample": false,
+     "early_stopping": false,
+     "num_beams": 1,
+     "num_beam_groups": 1,
+     "diversity_penalty": 0.0,
+     "temperature": 1.0,
+     "top_k": 50,
+     "top_p": 1.0,
+     "typical_p": 1.0,
+     "repetition_penalty": 1.0,
+     "length_penalty": 1.0,
+     "no_repeat_ngram_size": 0,
+     "encoder_no_repeat_ngram_size": 0,
+     "bad_words_ids": null,
+     "num_return_sequences": 1,
+     "chunk_size_feed_forward": 0,
+     "output_scores": false,
+     "return_dict_in_generate": false,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "remove_invalid_values": false,
+     "exponential_decay_length_penalty": null,
+     "suppress_tokens": null,
+     "begin_suppress_tokens": null,
+     "architectures": null,
+     "finetuning_task": null,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "tokenizer_class": null,
+     "prefix": null,
+     "bos_token_id": null,
+     "pad_token_id": 0,
+     "eos_token_id": null,
+     "sep_token_id": null,
+     "decoder_start_token_id": null,
+     "task_specific_params": null,
+     "problem_type": null,
+     "_name_or_path": "",
+     "transformers_version": "4.29.0.dev0",
+     "vocab_size": 30522,
+     "hidden_size": 1024,
+     "num_hidden_layers": 6,
+     "num_attention_heads": 16,
+     "hidden_act": "gelu",
+     "intermediate_size": 4096,
+     "hidden_dropout_prob": 0.1,
+     "attention_probs_dropout_prob": 0.1,
+     "max_position_embeddings": 512,
+     "initializer_range": 0.02,
+     "layer_norm_eps": 1e-06,
+     "position_embedding_type": "absolute",
+     "classifier_dropout": null,
+     "cross_attention_frequency": 2,
+     "encoder_hidden_size": 1024,
+     "model_type": "MPlugOwlVisualAbstractor"
+   },
+   "text_config": {
+     "vocab_size": 32000,
+     "max_position_embeddings": 2048,
+     "hidden_size": 4096,
+     "intermediate_size": 11008,
+     "num_hidden_layers": 32,
+     "num_attention_heads": 32,
+     "hidden_act": "silu",
+     "initializer_range": 0.02,
+     "rms_norm_eps": 1e-06,
+     "use_cache": true,
+     "return_dict": true,
+     "output_hidden_states": false,
+     "output_attentions": false,
+     "torchscript": false,
+     "torch_dtype": null,
+     "use_bfloat16": false,
+     "tf_legacy_loss": false,
+     "pruned_heads": {},
+     "tie_word_embeddings": false,
+     "is_encoder_decoder": false,
+     "is_decoder": false,
+     "cross_attention_hidden_size": null,
+     "add_cross_attention": false,
+     "tie_encoder_decoder": false,
+     "max_length": 20,
+     "min_length": 0,
+     "do_sample": false,
+     "early_stopping": false,
+     "num_beams": 1,
+     "num_beam_groups": 1,
+     "diversity_penalty": 0.0,
+     "temperature": 1.0,
+     "top_k": 50,
+     "top_p": 1.0,
+     "typical_p": 1.0,
+     "repetition_penalty": 1.0,
+     "length_penalty": 1.0,
+     "no_repeat_ngram_size": 0,
+     "encoder_no_repeat_ngram_size": 0,
+     "bad_words_ids": null,
+     "num_return_sequences": 1,
+     "chunk_size_feed_forward": 0,
+     "output_scores": false,
+     "return_dict_in_generate": false,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "remove_invalid_values": false,
+     "exponential_decay_length_penalty": null,
+     "suppress_tokens": null,
+     "begin_suppress_tokens": null,
+     "architectures": null,
+     "finetuning_task": null,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "tokenizer_class": null,
+     "prefix": null,
+     "bos_token_id": 1,
+     "pad_token_id": 2,
+     "eos_token_id": 2,
+     "sep_token_id": null,
+     "decoder_start_token_id": null,
+     "task_specific_params": null,
+     "problem_type": null,
+     "_name_or_path": "",
+     "transformers_version": "4.29.0.dev0",
+     "model_type": "llama"
+   },
+   "num_query_tokens": 64,
+   "use_decoder_only_language_model": true,
+   "initializer_factor": 1.0,
+   "initializer_range": 0.02,
+   "model_type": "mplug-owl"
+ }
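The config nests three sub-configurations: a 24-layer CLIP-style vision encoder, a 6-layer visual abstractor that compresses the image into 64 query tokens, and a 32-layer LLaMA text backbone. A quick, hedged way to confirm those numbers from the file alone, using only the standard `json` module (no custom mPLUG-Owl classes required):

```python
import json

# Inspect the nested mPLUG-Owl configuration directly from config.json.
with open("config.json") as f:
    cfg = json.load(f)

print(cfg["model_type"])                                      # mplug-owl
print(cfg["vision_config"]["num_hidden_layers"],
      cfg["vision_config"]["image_size"])                     # 24-layer ViT on 224px inputs
print(cfg["visual_abstractor_config"]["num_hidden_layers"],
      cfg["num_query_tokens"])                                # 6-layer abstractor, 64 query tokens
print(cfg["text_config"]["model_type"],
      cfg["text_config"]["num_hidden_layers"])                # llama, 32 layers
```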
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "resolution": 224,
+   "image_mean": [
+     0.48145466, 0.4578275, 0.40821073
+   ],
+   "image_std": [
+     0.26862954, 0.26130258, 0.27577711
+   ]
+ }
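The mean/std values are the usual CLIP normalization statistics at a 224px resolution. A hedged sketch of how an input image would typically be normalized with these numbers, using PIL and NumPy (the Space's actual preprocessing is implemented in its model code, which is not part of this commit):

```python
import json
import numpy as np
from PIL import Image

# Hedged sketch: apply resolution / image_mean / image_std the way a CLIP-style
# vision encoder usually expects; the real preprocessing lives in the model code.
with open("preprocessor_config.json") as f:
    pp = json.load(f)

size = pp["resolution"]                                   # 224
mean = np.array(pp["image_mean"], dtype=np.float32)
std = np.array(pp["image_std"], dtype=np.float32)

img = Image.open("examples/monday.jpg").convert("RGB").resize((size, size))
x = np.asarray(img, dtype=np.float32) / 255.0             # HWC in [0, 1]
x = (x - mean) / std                                      # channel-wise normalization
x = x.transpose(2, 0, 1)                                  # CHW layout for PyTorch
print(x.shape)                                            # (3, 224, 224)
```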
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7bf9c821075524e0ef7692552814a67de82c112d7341a2e7d6603ec51a13421b
+ size 14305584703
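This entry is a Git LFS pointer: the roughly 14.3 GB weight file itself is stored in LFS, and the pointer records its SHA-256 (`oid`) and byte size. A small sketch for verifying a locally downloaded copy against the pointer:

```python
import hashlib
import os

# Verify a downloaded pytorch_model.bin against the LFS pointer above.
EXPECTED_OID = "7bf9c821075524e0ef7692552814a67de82c112d7341a2e7d6603ec51a13421b"
EXPECTED_SIZE = 14305584703  # bytes

path = "pytorch_model.bin"
assert os.path.getsize(path) == EXPECTED_SIZE, "size mismatch"

digest = hashlib.sha256()
with open(path, "rb") as f:
    for block in iter(lambda: f.read(1 << 20), b""):
        digest.update(block)
assert digest.hexdigest() == EXPECTED_OID, "checksum mismatch"
print("pytorch_model.bin matches the LFS pointer")
```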
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ ninja
+ sentencepiece
+ icecream
+ transformers==4.28.1
+ tqdm
+ decord==0.6.0
+ timm==0.6.7
+ oss2
+ markdown2
+ hjson
+ einops
+ wget
+ accelerate
+ gradio==3.20.1
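Only a few of these dependencies are hard-pinned (transformers, decord, timm, gradio); note that `gradio==3.20.1` is pinned here while the Space metadata in README.md declares `sdk_version: 3.28.1`. A small sanity-check sketch (assumed to run inside the Space's environment) that prints the installed versions against those pins:

```python
from importlib.metadata import version

# Sanity check (assumption: run inside the Space's environment): compare the
# installed versions of the hard-pinned dependencies against requirements.txt.
pins = {"transformers": "4.28.1", "decord": "0.6.0", "timm": "0.6.7", "gradio": "3.20.1"}
for package, pinned in pins.items():
    print(f"{package}: installed {version(package)}, pinned {pinned}")
```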
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "model_max_length": 1000000000000000019884624838656, "tokenizer_class": "MplugOwlTokenizer", "unk_token": "<unk>", "pad_token": "<unk>"}