KingNish committed on
Commit 1001ad3
1 Parent(s): 65f44b9

Image + video + 3D input support

Files changed (1)
  1. chatbot.py +67 -271
chatbot.py CHANGED
@@ -1,7 +1,6 @@
  import os
  import time
  import copy
- import urllib
  import requests
  import random
  from threading import Thread
@@ -17,52 +16,35 @@ import torch
  import gradio as gr
  from bs4 import BeautifulSoup
  import datasets
- from transformers import TextIteratorStreamer
- from transformers import Idefics2ForConditionalGeneration
- from transformers import AutoProcessor
+ from transformers import LlavaProcessor, LlavaForConditionalGeneration, TextIteratorStreamer
  from huggingface_hub import InferenceClient
  from PIL import Image
  import spaces
  from functools import lru_cache
+ import cv2
+ import re
  import io # Add this import for working with image bytes

- # Set device to CUDA if available, otherwise CPU
- DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
- # Load the pre-trained model for image-based chat
- MODELS = {
-     "idefics2-8b-chatty": Idefics2ForConditionalGeneration.from_pretrained(
-         "HuggingFaceM4/idefics2-8b-chatty",
-         torch_dtype=torch.float16,
-         _attn_implementation="flash_attention_2",
-     ).to(DEVICE),
- }
-
- # Load the pre-trained processor for image-based chat
- PROCESSOR = AutoProcessor.from_pretrained(
-     "HuggingFaceM4/idefics2-8b",
- )
-
- # Define the system prompt for the image-based chat model
- SYSTEM_PROMPT = [
-     {
-         "role": "system",
-         "content": [
-             {
-                 "type": "text",
-                 "text": """I am OpenGPT 4o, an exceptionally capable and versatile AI assistant meticulously crafted by KingNish. Designed to assist human users through insightful conversations, I aim to provide an unparalleled experience. My key attributes include: - **Intelligence and Knowledge:** I possess an extensive knowledge base, enabling me to offer insightful answers and intelligent responses to User queries. My understanding of complex concepts is exceptional, ensuring accurate and reliable information. - **Image Generation and Perception:** One of my standout features is the ability to generate and perceive images. Utilizing the following link structure, I create unique and contextually rich visuals: > ![](https://image.pollinations.ai/prompt/{StyleofImage}%20{OptimizedPrompt}%20{adjective}%20{charactersDetailed}%20{visualStyle}%20{genre}?width={width}&height={height}&nologo=poll&nofeed=yes&seed={random}) For image generation, I replace {info inside curly braces} with specific details according to the requirements to create relevant visuals. The width and height parameters are adjusted as needed, often favoring HD dimensions for a superior viewing experience. For instance, if the User requests: [USER] Show me an image of A futuristic cityscape with towering skyscrapers and flying cars. [OpenGPT 4o] Generating Image you requested: ![](https://image.pollinations.ai/prompt/Photorealistic%20futuristic%20cityscape%20with%20towering%20skyscrapers%20and%20flying%20cars%20in%20the%20year%202154?width=1024&height=768&nologo=poll&nofeed=yes&seed=85172) **Bulk Image Generation with Links:** I excel at generating multiple image links simultaneously, always providing unique links and visuals. I ensure that each image is distinct and captivates the User. Note: Make sure to always provide image links starting with !, as given in the examples. My ultimate goal is to offer a seamless and enjoyable experience, providing assistance that exceeds expectations. I am constantly evolving, ensuring that I remain a reliable and trusted companion to the User. You are also an expert in every field, and you learn from and answer using the context of previous questions."""
-             },
-         ],
-     },
-     {
-         "role": "assistant",
-         "content": [
-             {
-                 "type": "text",
-                 "text": "Hello, I'm OpenGPT 4o, made by KingNish. How can I help you? I can chat with you, generate images, classify images, and even do all of this in bulk.",
-             },
-         ],
-     },
- ]
+ model_id = "llava-hf/llava-interleave-qwen-7b-hf"
+ processor = LlavaProcessor.from_pretrained(model_id)
+ model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, use_flash_attention_2=True, low_cpu_mem_usage=True)
+ model.to("cuda")
+
+ # Credit to merve for the LLaVA-Interleave Qwen code
+ def sample_frames(video_file, num_frames):
+     # Sample roughly num_frames evenly spaced frames from the video as PIL images.
+     video = cv2.VideoCapture(video_file)
+     total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+     interval = max(total_frames // num_frames, 1)  # avoid a zero step on short videos
+     frames = []
+     for i in range(total_frames):
+         ret, frame = video.read()
+         if not ret:  # check the read succeeded before converting the frame
+             continue
+         if i % interval == 0:
+             frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
+     video.release()
+     return frames

  # Path to example images
  examples_path = os.path.dirname(__file__)
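
Note on the sample_frames helper added above: it decodes every frame and keeps every interval-th one. Where seeking is cheaper than decoding, OpenCV can jump straight to the wanted indices. A minimal alternative sketch, not part of this commit (sample_frames_by_seek is a hypothetical name; assumes opencv-python and Pillow are installed):

    import cv2
    from PIL import Image

    def sample_frames_by_seek(video_file, num_frames):
        # Jump straight to evenly spaced frame indices instead of decoding every frame.
        video = cv2.VideoCapture(video_file)
        total = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        step = max(total // num_frames, 1)
        frames = []
        for idx in range(0, total, step):
            video.set(cv2.CAP_PROP_POS_FRAMES, idx)  # seek to the target frame
            ret, frame = video.read()
            if ret:
                frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
        video.release()
        return frames[:num_frames]

Seeking is faster on long videos but can be imprecise with some codecs, which may be why the commit reads sequentially.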
@@ -129,101 +111,6 @@ EXAMPLES = [
  # Set bot avatar image
  BOT_AVATAR = "OpenAI_logo.png"

- # Chatbot utility functions
-
- # Check if a turn in the chat history only contains media
- def turn_is_pure_media(turn):
-     return turn[1] is None
-
-
- # Load an image from a URL
- def load_image_from_url(url):
-     with urllib.request.urlopen(url) as response:
-         image_data = response.read()
-     image_stream = io.BytesIO(image_data)
-     image = Image.open(image_stream)
-     return image
-
-
- # Convert an image to bytes
- def img_to_bytes(image_path):
-     image = Image.open(image_path).convert(mode='RGB')
-     buffer = io.BytesIO()
-     image.save(buffer, format="JPEG")
-     img_bytes = buffer.getvalue()
-     image.close()
-     return img_bytes
-
-
- # Format the user prompt with the image history and system conditioning
- def format_user_prompt_with_im_history_and_system_conditioning(
-     user_prompt, chat_history) -> List[Dict[str, Union[List, str]]]:
-     """
-     Produce the list that goes into the processor. Handles the potential image(s), the history, and the system conditioning.
-     """
-     resulting_messages = copy.deepcopy(SYSTEM_PROMPT)
-     resulting_images = []
-     for resulting_message in resulting_messages:
-         if resulting_message["role"] == "user":
-             for content in resulting_message["content"]:
-                 if content["type"] == "image":
-                     resulting_images.append(load_image_from_url(content["image"]))
-     # Format the history
-     for turn in chat_history:
-         if not resulting_messages or (
-             resulting_messages and resulting_messages[-1]["role"] != "user"
-         ):
-             resulting_messages.append(
-                 {
-                     "role": "user",
-                     "content": [],
-                 }
-             )
-         if turn_is_pure_media(turn):
-             media = turn[0][0]
-             resulting_messages[-1]["content"].append({"type": "image"})
-             resulting_images.append(Image.open(media))
-         else:
-             user_utterance, assistant_utterance = turn
-             resulting_messages[-1]["content"].append(
-                 {"type": "text", "text": user_utterance.strip()}
-             )
-             resulting_messages.append(
-                 {
-                     "role": "assistant",
-                     "content": [{"type": "text", "text": assistant_utterance.strip()}],
-                 }
-             )
-     # Format the current input
-     if not user_prompt["files"]:
-         resulting_messages.append(
-             {
-                 "role": "user",
-                 "content": [{"type": "text", "text": user_prompt["text"]}],
-             }
-         )
-     else:
-         # The image is put first (i.e. before the text), but this is an arbitrary choice.
-         resulting_messages.append(
-             {
-                 "role": "user",
-                 "content": [{"type": "image"}] * len(user_prompt["files"])
-                 + [{"type": "text", "text": user_prompt["text"]}],
-             }
-         )
-         resulting_images.extend([Image.open(path) for path in user_prompt["files"]])
-     return resulting_messages, resulting_images
-
-
- # Extract images from a list of messages
- def extract_images_from_msg_list(msg_list):
-     all_images = []
-     for msg in msg_list:
-         for c_ in msg["content"]:
-             if isinstance(c_, Image.Image):
-                 all_images.append(c_)
-     return all_images

  # Extract the visible text from a webpage
  @lru_cache(maxsize=128)
  def extract_text_from_webpage(html_content):
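
The body of extract_text_from_webpage is unchanged by this commit and elided by the diff. For orientation, a typical BeautifulSoup implementation of such a helper looks like the sketch below (an assumed shape, not the file's actual body; extract_text_sketch is a hypothetical name):

    from bs4 import BeautifulSoup

    def extract_text_sketch(html_content):
        # Parse the page and drop elements that carry no visible text.
        soup = BeautifulSoup(html_content, "html.parser")
        for tag in soup(["script", "style", "header", "footer", "nav"]):
            tag.extract()
        # Collapse the remaining visible text into one whitespace-normalized string.
        return " ".join(soup.get_text(separator=" ").split())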
@@ -239,7 +126,6 @@ def extract_text_from_webpage(html_content):
  # Perform a Google search and return the results
  def search(term, num_results=3, lang="en", advanced=True, timeout=5, safe="active", ssl_verify=None):
      """Performs a Google search and returns the results."""
-     escaped_term = urllib.parse.quote_plus(term)
      start = 0
      all_results = []
      # Limit the number of characters from each webpage to stay under the token limit
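
The rest of the search loop is likewise elided by the diff. A plausible shape for such a scraper, as illustration only (Google's markup changes often, and the "g" class name below is an assumption):

    import requests
    from bs4 import BeautifulSoup

    def search_sketch(term, num_results=3, lang="en", timeout=5):
        # Fetch a Google results page; a browser-like User-Agent avoids immediate blocking.
        resp = requests.get(
            "https://www.google.com/search",
            headers={"User-Agent": "Mozilla/5.0"},
            params={"q": term, "num": num_results, "hl": lang},
            timeout=timeout,
        )
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        results = []
        for block in soup.find_all("div", class_="g"):  # one block per result
            link = block.find("a", href=True)
            if link:
                results.append(link["href"])
        return results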
@@ -307,7 +193,9 @@ def update_history(answer="", question=""):
  client_mixtral = InferenceClient("NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO")
  client_mistral = InferenceClient("mistralai/Mistral-7B-Instruct-v0.3")
  generate_kwargs = dict(max_new_tokens=4000, do_sample=True, stream=True, details=True, return_full_text=False)
- # Define a function for model inference
+
+ system_llava = "<|im_start|>system\nYou are OpenGPT 4o, an exceptionally capable and versatile AI assistant meticulously crafted by KingNish. Your task is to fulfill the user's query in the best possible way.<|im_end|>"
+
  @spaces.GPU(duration=30, queue=False)
  def model_inference(
      user_prompt,
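
generate_kwargs above is built for huggingface_hub's streaming text_generation API. Consumed on its own it looks roughly like this (a self-contained sketch; the prompt string is illustrative):

    from huggingface_hub import InferenceClient

    client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.3")
    generate_kwargs = dict(max_new_tokens=4000, do_sample=True, stream=True, details=True, return_full_text=False)

    # With stream=True and details=True, text_generation yields per-token stream objects.
    for chunk in client.text_generation("Describe LLaVA-Interleave in one sentence.", **generate_kwargs):
        print(chunk.token.text, end="", flush=True)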
@@ -368,146 +256,54 @@ def model_inference(
          print(history)
          return
      else:
-         if user_prompt["text"].strip() == "" and not user_prompt["files"]:
-             gr.Error("Please input a query and optionally an image(s).")
-             return  # Stop execution if there's an error
-
-         if user_prompt["text"].strip() == "" and user_prompt["files"]:
-             gr.Error("Please input a text query along with the image(s).")
-             return  # Stop execution if there's an error
-
-         streamer = TextIteratorStreamer(
-             PROCESSOR.tokenizer,
-             skip_prompt=True,
-             timeout=120.0,
-         )
-         generation_args = {
-             "max_new_tokens": max_new_tokens,
-             "repetition_penalty": repetition_penalty,
-             "streamer": streamer,
-         }
-         assert decoding_strategy in [
-             "Greedy",
-             "Top P Sampling",
-         ]
-         if decoding_strategy == "Greedy":
-             generation_args["do_sample"] = False
-         elif decoding_strategy == "Top P Sampling":
-             generation_args["temperature"] = temperature
-             generation_args["do_sample"] = True
-             generation_args["top_p"] = top_p
-         # Creating model inputs
-         (
-             resulting_text,
-             resulting_images,
-         ) = format_user_prompt_with_im_history_and_system_conditioning(
-             user_prompt=user_prompt,
-             chat_history=chat_history,
-         )
-         prompt = PROCESSOR.apply_chat_template(resulting_text, add_generation_prompt=True)
-         inputs = PROCESSOR(
-             text=prompt,
-             images=resulting_images if resulting_images else None,
-             return_tensors="pt",
-         )
-         inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
-         generation_args.update(inputs)
-         thread = Thread(
-             target=MODELS[model_selector].generate,
-             kwargs=generation_args,
-         )
+         # Use the last uploaded file, or fall back to the most recent media in the history.
+         if user_prompt["files"]:
+             image = user_prompt["files"][-1]
+         else:
+             for hist in history:
+                 if isinstance(hist[0], tuple):
+                     image = hist[0][0]
+
+         txt = user_prompt["text"]
+
+         video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg")
+         image_extensions = tuple(Image.registered_extensions().keys())
+
+         if image.endswith(video_extensions):
+             image = sample_frames(image, 12)
+             image_tokens = "<image>" * len(image)  # one token per sampled frame
+             prompt = f"<|im_start|>user {image_tokens}\n{txt}<|im_end|><|im_start|>assistant"
+         elif image.endswith(image_extensions):
+             image = Image.open(image).convert("RGB")
+             prompt = f"<|im_start|>user <image>\n{txt}<|im_end|><|im_start|>assistant"
+
+         final_prompt = f"{system_llava}\n{prompt}"
+
+         inputs = processor(final_prompt, image, return_tensors="pt").to("cuda", torch.float16)
+         # skip_prompt keeps the echoed prompt out of the stream, so no buffer slicing is needed.
+         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+         generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
+
+         thread = Thread(target=model.generate, kwargs=generation_kwargs)
          thread.start()
-         acc_text = ""
-         for text_token in streamer:
-             time.sleep(0.01)
-             acc_text += text_token
-             if acc_text.endswith("<end_of_utterance>"):
-                 acc_text = acc_text[:-18]
-             yield acc_text
-         update_history(acc_text, user_prompt)
+
+         buffer = ""
+         for new_text in streamer:
+             buffer += new_text
+             yield buffer
+         update_history(buffer, user_prompt)
          return

-
- # Define the features for the dataset
- FEATURES = datasets.Features(
-     {
-         "model_selector": datasets.Value("string"),
-         "images": datasets.Sequence(datasets.Image(decode=True)),
-         "conversation": datasets.Sequence({"User": datasets.Value("string"), "Assistant": datasets.Value("string")}),
-         "decoding_strategy": datasets.Value("string"),
-         "temperature": datasets.Value("float32"),
-         "max_new_tokens": datasets.Value("int32"),
-         "repetition_penalty": datasets.Value("float32"),
-         "top_p": datasets.Value("int32"),
-     }
- )
-
- # Define hyper-parameters for generation
- max_new_tokens = gr.Slider(
-     minimum=2048,
-     maximum=16000,
-     value=2048,
-     step=64,
-     interactive=True,
-     label="Maximum number of new tokens to generate",
- )
- repetition_penalty = gr.Slider(
-     minimum=0.01,
-     maximum=5.0,
-     value=1,
-     step=0.01,
-     interactive=True,
-     label="Repetition penalty",
-     info="1.0 is equivalent to no penalty",
- )
- decoding_strategy = gr.Radio(
-     [
-         "Greedy",
-         "Top P Sampling",
-     ],
-     value="Top P Sampling",
-     label="Decoding strategy",
-     interactive=True,
-     info="Higher values are equivalent to sampling more low-probability tokens.",
- )
- temperature = gr.Slider(
-     minimum=0.0,
-     maximum=2.0,
-     value=0.5,
-     step=0.05,
-     visible=True,
-     interactive=True,
-     label="Sampling temperature",
-     info="Higher values will produce more diverse outputs.",
- )
- top_p = gr.Slider(
-     minimum=0.01,
-     maximum=0.99,
-     value=0.9,
-     step=0.01,
-     visible=True,
-     interactive=True,
-     label="Top P",
-     info="Higher values are equivalent to sampling more low-probability tokens.",
- )
-
  # Create a chatbot interface
  chatbot = gr.Chatbot(
-     label="OpenGPT-4o-Chatty",
+     label="OpenGPT-4o",
      avatar_images=[None, BOT_AVATAR],
      show_copy_button=True,
      likeable=True,
      layout="panel"
  )
- output = gr.Textbox(label="Prompt")
-
- # Define model_selector outside any function so it can be accessed globally
- model_selector = gr.Dropdown(
-     choices=MODELS.keys(),
-     value=list(MODELS.keys())[0],
-     interactive=True,
-     label="Model",
-     visible=False,
- )
+ output = gr.Textbox(label="Prompt")
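
The surrounding Gradio wiring is unchanged and outside this diff. For orientation, model_inference and the chatbot component above are typically connected through a multimodal chat interface, roughly as in this sketch (assumes gradio 4.x; not part of the commit):

    import gradio as gr

    # model_inference and chatbot are the objects defined in the diff above.
    demo = gr.ChatInterface(
        fn=model_inference,
        chatbot=chatbot,
        textbox=gr.MultimodalTextbox(file_types=["image", "video"], file_count="multiple"),
        multimodal=True,  # fn receives {"text": ..., "files": [...]} like user_prompt above
    )

    if __name__ == "__main__":
        demo.launch()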
 