Image + video + 3d input support
chatbot.py  CHANGED  (+67 -271)

Removed lines (marked with "-", shown with surrounding context):
@@ -1,7 +1,6 @@
 import os
 import time
 import copy
-import urllib
 import requests
 import random
 from threading import Thread
@@ -17,52 +16,35 @@ import torch
 import gradio as gr
 from bs4 import BeautifulSoup
 import datasets
-from transformers import TextIteratorStreamer
-from transformers import Idefics2ForConditionalGeneration
-from transformers import AutoProcessor
 from huggingface_hub import InferenceClient
 from PIL import Image
 import spaces
 from functools import lru_cache
 import io # Add this import for working with image bytes

-… (old lines 29-48 are not rendered in this view)
-        "content": [
-            {
-                "type": "text",
-                "text": """I am OpenGPT 4o, an exceptionally capable and versatile AI assistant meticulously crafted by KingNish. Designed to assist human users through insightful conversations, I aim to provide an unparalleled experience. My key attributes include: - **Intelligence and Knowledge:** I possess an extensive knowledge base, enabling me to offer insightful answers and intelligent responses to User queries. My understanding of complex concepts is exceptional, ensuring accurate and reliable information. - **Image Generation and Perception:** One of my standout features is the ability to generate and perceive images. Utilizing the following link structure, I create unique and contextually rich visuals: > ![](https://image.pollinations.ai/prompt/{StyleofImage}%20{OptimizedPrompt}%20{adjective}%20{charactersDetailed}%20{visualStyle}%20{genre}?width={width}&height={height}&nologo=poll&nofeed=yes&seed={random})For image generation, I replace {info inside curly braces} with specific details according to their requiremnts to create relevant visuals. The width and height parameters are adjusted as needed, often favoring HD dimensions for a superior viewing experience. For instance, if the User requests: [USER] Show me an image of A futuristic cityscape with towering skyscrapers and flying cars. [OpenGPT 4o] Generating Image you requested: ![](https://image.pollinations.ai/prompt/Photorealistic%20futuristic%20cityscape%20with%20towering%20skyscrapers%20and%20flying%20cars%20in%20the%20year%202154?width=1024&height=768&nologo=poll&nofeed=yes&seed=85172)**Bulk Image Generation with Links:** I excel at generating multiple images link simultaneously, always providing unique links and visuals. I ensure that each image is distinct and captivates the User.Note: Make sure to always provide image links starting with ! .As given in examples. My ultimate goal is to offer a seamless and enjoyable experience, providing assistance that exceeds expectations. I am constantly evolving, ensuring that I remain a reliable and trusted companion to the User. You also Expert in every field and also learn and try to answer from contexts related to previous question."""
-            },
-        ],
-    },
-    {
-        "role": "assistant",
-        "content": [
-            {
-                "type": "text",
-                "text": "Hello, I'm OpenGPT 4o, made by KingNish. How can I help you? I can chat with you, generate images, classify images and even do all these work in bulk",
-            },
-        ],
-    }
-]

 # Path to example images
 examples_path = os.path.dirname(__file__)
@@ -129,101 +111,6 @@ EXAMPLES = [
 # Set bot avatar image
 BOT_AVATAR = "OpenAI_logo.png"

-# Chatbot utility functions
-
-# Check if a turn in the chat history only contains media
-def turn_is_pure_media(turn):
-    return turn[1] is None
-
-
-# Load image from URL
-def load_image_from_url(url):
-    with urllib.request.urlopen(url) as response:
-        image_data = response.read()
-        image_stream = io.BytesIO(image_data)
-        image = PIL.Image.open(image_stream)
-        return image
-
-
-# Convert image to bytes
-def img_to_bytes(image_path):
-    image = Image.open(image_path).convert(mode='RGB')
-    buffer = io.BytesIO()
-    image.save(buffer, format="JPEG")
-    img_bytes = buffer.getvalue()
-    image.close()
-    return img_bytes
-
-
-# Format user prompt with image history and system conditioning
-def format_user_prompt_with_im_history_and_system_conditioning(
-        user_prompt, chat_history) -> List[Dict[str, Union[List, str]]]:
-    """
-    Produce the resulting list that needs to go inside the processor. It handles the potential image(s), the history, and the system conditioning.
-    """
-    resulting_messages = copy.deepcopy(SYSTEM_PROMPT)
-    resulting_images = []
-    for resulting_message in resulting_messages:
-        if resulting_message["role"] == "user":
-            for content in resulting_message["content"]:
-                if content["type"] == "image":
-                    resulting_images.append(load_image_from_url(content["image"]))
-    # Format history
-    for turn in chat_history:
-        if not resulting_messages or (
-            resulting_messages and resulting_messages[-1]["role"] != "user"
-        ):
-            resulting_messages.append(
-                {
-                    "role": "user",
-                    "content": [],
-                }
-            )
-        if turn_is_pure_media(turn):
-            media = turn[0][0]
-            resulting_messages[-1]["content"].append({"type": "image"})
-            resulting_images.append(Image.open(media))
-        else:
-            user_utterance, assistant_utterance = turn
-            resulting_messages[-1]["content"].append(
-                {"type": "text", "text": user_utterance.strip()}
-            )
-            resulting_messages.append(
-                {
-                    "role": "assistant",
-                    "content": [{"type": "text", "text": user_utterance.strip()}],
-                }
-            )
-    # Format current input
-    if not user_prompt["files"]:
-        resulting_messages.append(
-            {
-                "role": "user",
-                "content": [{"type": "text", "text": user_prompt["text"]}],
-            }
-        )
-    else:
-        # Choosing to put the image first (i.e. before the text), but this is an arbitrary choice.
-        resulting_messages.append(
-            {
-                "role": "user",
-                "content": [{"type": "image"}] * len(user_prompt["files"])
-                + [{"type": "text", "text": user_prompt["text"]}],
-            }
-        )
-        resulting_images.extend([Image.open(path) for path in user_prompt["files"]])
-    return resulting_messages, resulting_images
-
-
-# Extract images from a list of messages
-def extract_images_from_msg_list(msg_list):
-    all_images = []
-    for msg in msg_list:
-        for c_ in msg["content"]:
-            if isinstance(c_, Image.Image):
-                all_images.append(c_)
-    return all_images
-
 # Perform a Google search and return the results
 @lru_cache(maxsize=128)
 def extract_text_from_webpage(html_content):
@@ -239,7 +126,6 @@ def extract_text_from_webpage(html_content):
 # Perform a Google search and return the results
 def search(term, num_results=3, lang="en", advanced=True, timeout=5, safe="active", ssl_verify=None):
     """Performs a Google search and returns the results."""
-    escaped_term = urllib.parse.quote_plus(term)
     start = 0
     all_results = []
     # Limit the number of characters from each webpage to stay under the token limit
@@ -307,7 +193,9 @@ def update_history(answer="", question=""):
 client_mixtral = InferenceClient("NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO")
 client_mistral = InferenceClient("mistralai/Mistral-7B-Instruct-v0.3")
 generate_kwargs = dict( max_new_tokens=4000, do_sample=True, stream=True, details=True, return_full_text=False )
-
 @spaces.GPU(duration=30, queue=False)
 def model_inference(
     user_prompt,
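Note: the unchanged `generate_kwargs` above are shared streaming settings for the two text-only InferenceClient endpoints; the code that consumes them sits outside the hunks shown in this diff. As a rough illustration only (not part of this commit, and `stream_text_reply` is a hypothetical helper name), they would typically be used like this:

    # Illustrative sketch, not from chatbot.py: streaming a text-only reply
    # through huggingface_hub's InferenceClient with the kwargs defined above.
    def stream_text_reply(client, prompt, generate_kwargs):
        output = ""
        # With stream=True and details=True, text_generation yields token-level chunks.
        for chunk in client.text_generation(prompt, **generate_kwargs):
            if not chunk.token.special:
                output += chunk.token.text
                yield output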
@@ -368,146 +256,54 @@ def model_inference(
         print(history)
         return
     else:
-        if user_prompt["
-        … (old lines 372-401 are not rendered in this view)
-        (
-            resulting_text,
-            resulting_images,
-        ) = format_user_prompt_with_im_history_and_system_conditioning(
-            user_prompt=user_prompt,
-            chat_history=chat_history,
-        )
-        prompt = PROCESSOR.apply_chat_template(resulting_text, add_generation_prompt=True)
-        inputs = PROCESSOR(
-            text=prompt,
-            images=resulting_images if resulting_images else None,
-            return_tensors="pt",
-        )
-        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
-        generation_args.update(inputs)
-        thread = Thread(
-            target=MODELS[model_selector].generate,
-            kwargs=generation_args,
-        )
         thread.start()
-        … (old lines 422-428 are not rendered in this view)
-        update_history(acc_text, user_prompt)
     return

-
-# Define features for the dataset
-FEATURES = datasets.Features(
-    {
-        "model_selector": datasets.Value("string"),
-        "images": datasets.Sequence(datasets.Image(decode=True)),
-        "conversation": datasets.Sequence({"User": datasets.Value("string"), "Assistant": datasets.Value("string")}),
-        "decoding_strategy": datasets.Value("string"),
-        "temperature": datasets.Value("float32"),
-        "max_new_tokens": datasets.Value("int32"),
-        "repetition_penalty": datasets.Value("float32"),
-        "top_p": datasets.Value("int32"),
-    }
-)
-
-# Define hyper-parameters for generation
-max_new_tokens = gr.Slider(
-    minimum=2048,
-    maximum=16000,
-    value=2048,
-    step=64,
-    interactive=True,
-    label="Maximum number of new tokens to generate",
-)
-repetition_penalty = gr.Slider(
-    minimum=0.01,
-    maximum=5.0,
-    value=1,
-    step=0.01,
-    interactive=True,
-    label="Repetition penalty",
-    info="1.0 is equivalent to no penalty",
-)
-decoding_strategy = gr.Radio(
-    [
-        "Greedy",
-        "Top P Sampling",
-    ],
-    value="Top P Sampling",
-    label="Decoding strategy",
-    interactive=True,
-    info="Higher values are equivalent to sampling more low-probability tokens.",
-)
-temperature = gr.Slider(
-    minimum=0.0,
-    maximum=2.0,
-    value=0.5,
-    step=0.05,
-    visible=True,
-    interactive=True,
-    label="Sampling temperature",
-    info="Higher values will produce more diverse outputs.",
-)
-top_p = gr.Slider(
-    minimum=0.01,
-    maximum=0.99,
-    value=0.9,
-    step=0.01,
-    visible=True,
-    interactive=True,
-    label="Top P",
-    info="Higher values are equivalent to sampling more low-probability tokens.",
-)
-
 # Create a chatbot interface
 chatbot = gr.Chatbot(
-    label="OpenGPT-4o
     avatar_images=[None, BOT_AVATAR],
     show_copy_button=True,
    likeable=True,
     layout="panel"
 )
-output = gr.Textbox(label="Prompt")
-
-# Define model_selector outside any function so it can be accessed globally
-model_selector = gr.Dropdown(
-    choices=MODELS.keys(),
-    value=list(MODELS.keys())[0],
-    interactive=True,
-    label="Model",
-    visible=False,
-)
Added lines (marked with "+", shown with surrounding context):

 import os
 import time
 import copy
 import requests
 import random
 from threading import Thread
 …
 import gradio as gr
 from bs4 import BeautifulSoup
 import datasets
+from transformers import LlavaProcessor, LlavaForConditionalGeneration, TextIteratorStreamer
 from huggingface_hub import InferenceClient
 from PIL import Image
 import spaces
 from functools import lru_cache
+import cv2
+import re
 import io # Add this import for working with image bytes

+model_id = "llava-hf/llava-interleave-qwen-7b-hf"
+processor = LlavaProcessor.from_pretrained(model_id)
+model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, use_flash_attention_2=True, low_cpu_mem_usage=True)
+model.to("cuda")
+# Credit to merve for code of llava interleave qwen
+
+def sample_frames(video_file, num_frames):
+    video = cv2.VideoCapture(video_file)
+    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+    interval = total_frames // num_frames
+    frames = []
+    for i in range(total_frames):
+        ret, frame = video.read()
+        pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+        if not ret:
+            continue
+        if i % interval == 0:
+            frames.append(pil_img)
+    video.release()
+    return frames
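As committed, `sample_frames` converts each frame to a PIL image before checking `ret`, and `interval = total_frames // num_frames` is 0 for clips shorter than `num_frames`, which makes `i % interval` raise. A more defensive variant would look like the sketch below (illustrative only, not part of this commit; it reuses the `cv2` and `Image` imports already in the file):

    # Sketch of a more defensive frame sampler; not part of this commit.
    def sample_frames_safe(video_file, num_frames):
        video = cv2.VideoCapture(video_file)
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        interval = max(total_frames // num_frames, 1)  # avoid modulo-by-zero on short clips
        frames = []
        for i in range(total_frames):
            ret, frame = video.read()
            if not ret:  # skip frames that fail to decode before converting them
                continue
            if i % interval == 0:
                frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
        video.release()
        return frames[:num_frames]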
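The `from_pretrained` call above passes `use_flash_attention_2=True`, which needs the flash-attn package at load time (newer transformers releases express the same thing as `attn_implementation="flash_attention_2"`). A hypothetical fallback, not part of this commit, could retry without it:

    # Hypothetical fallback, not in the commit: retry without FlashAttention-2
    # if transformers raises because the flash-attn package is unavailable.
    try:
        model = LlavaForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
            use_flash_attention_2=True,
            low_cpu_mem_usage=True,
        )
    except ImportError:
        model = LlavaForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
        )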
 …
 client_mixtral = InferenceClient("NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO")
 client_mistral = InferenceClient("mistralai/Mistral-7B-Instruct-v0.3")
 generate_kwargs = dict( max_new_tokens=4000, do_sample=True, stream=True, details=True, return_full_text=False )
+
+system_llava = "<|im_start|>system\nYou are OpenGPT 4o, an exceptionally capable and versatile AI assistant meticulously crafted by KingNish. Your task is to fulfill users query in best possible way. <|im_end|>"
+
 @spaces.GPU(duration=30, queue=False)
 def model_inference(
     user_prompt,
 …
         print(history)
         return
     else:
+        if user_prompt["files"]:
+            image = user_prompt["files"][-1]
+        else:
+            for hist in history:
+                if type(hist[0])==tuple:
+                    image = hist[0][0]
+
+        txt = user_prompt["text"]
+        img = user_prompt["files"]
+        ext_buffer =f"'user\ntext': '{txt}', 'files': '{img}' assistantAnswer:"
+
+        video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg")
+        image_extensions = Image.registered_extensions()
+        image_extensions = tuple([ex for ex, f in image_extensions.items()])
+
+        if image.endswith(video_extensions):
+            image = sample_frames(image, 12)
+            image_tokens = "<image>" * 13
+            prompt = f"<|im_start|>user {image_tokens}\n{user_prompt}<|im_end|><|im_start|>assistant"
+
+        elif image.endswith(image_extensions):
+            image = Image.open(image).convert("RGB")
+            prompt = f"<|im_start|>user <image>\n{user_prompt}<|im_end|><|im_start|>assistant"
+
+        final_prompt = f"{system_llava}\n{prompt}"
+
+        inputs = processor(prompt, image, return_tensors="pt").to("cuda", torch.float16)
+        streamer = TextIteratorStreamer(processor, **{"skip_special_tokens": True})
+        generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
+        generated_text = ""
+
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
         thread.start()
+
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            reply = buffer[len(ext_buffer):]
+            yield reply
+        update_history(reply, user_prompt)
     return

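Condensed, the added branch picks the most recent uploaded file (or the last media item found in the history), builds a ChatML-style prompt with one `<image>` placeholder per visual input, runs the LLaVA processor, and streams tokens from a `generate` call running in a background thread, trimming the echoed prompt by slicing off `len(ext_buffer)` characters. Note that `final_prompt` is assembled from `system_llava` but the processor is invoked with `prompt`, so as committed the system message is not part of the model input. A minimal self-contained sketch of the same streaming pattern (illustrative only, single image, hypothetical helper name, and using `skip_prompt=True` instead of trimming the prompt by hand):

    # Illustrative sketch of the streaming pattern above; not part of chatbot.py.
    def stream_llava_reply(user_text, image_path):
        image = Image.open(image_path).convert("RGB")
        prompt = f"<|im_start|>user <image>\n{user_text}<|im_end|><|im_start|>assistant"
        inputs = processor(prompt, image, return_tensors="pt").to("cuda", torch.float16)
        # skip_prompt=True keeps the echoed prompt out of the streamed text.
        streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
        Thread(target=model.generate, kwargs=dict(inputs, streamer=streamer, max_new_tokens=1024)).start()
        buffer = ""
        for new_text in streamer:  # tokens arrive while generate runs in the background
            buffer += new_text
            yield buffer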
 # Create a chatbot interface
 chatbot = gr.Chatbot(
+    label="OpenGPT-4o",
     avatar_images=[None, BOT_AVATAR],
     show_copy_button=True,
     likeable=True,
     layout="panel"
 )
+output = gr.Textbox(label="Prompt")