KingNish committed
Commit 5865d17
1 Parent(s): 76c17eb

Update chatbot.py

Files changed (1)
  1. chatbot.py +73 -47
chatbot.py CHANGED
@@ -4,16 +4,18 @@ import requests
 import random
 from threading import Thread
 from typing import List, Dict, Union
-import subprocess
-subprocess.run(
-    "pip install flash-attn --no-build-isolation",
-    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
-    shell=True,
-)
+# import subprocess
+# subprocess.run(
+#     "pip install flash-attn --no-build-isolation",
+#     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+#     shell=True,
+# )
 import torch
 import gradio as gr
 from bs4 import BeautifulSoup
-from transformers import LlavaProcessor, LlavaForConditionalGeneration, TextIteratorStreamer
+from transformers import LlavaProcessor, LlavaForConditionalGeneration
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
+from qwen_vl_utils import process_vision_info
 from huggingface_hub import InferenceClient
 from PIL import Image
 import spaces
@@ -25,14 +27,10 @@ import json
 from gradio_client import Client, file
 from groq import Groq
 
-# You can also use models that are commented below
-# model_id = "llava-hf/llava-interleave-qwen-0.5b-hf"
-model_id = "llava-hf/llava-interleave-qwen-7b-hf"
-# model_id = "llava-hf/llava-interleave-qwen-7b-dpo-hf"
-processor = LlavaProcessor.from_pretrained(model_id)
-model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, use_flash_attention_2=True)
-model.to("cuda")
-# Credit to merve for code of llava interleave qwen
+# Model and Processor Loading (Done once at startup)
+MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
+model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_ID, trust_remote_code=True, torch_dtype=torch.float16).to("cuda").eval()
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 
 GROQ_API_KEY = os.environ.get("GROQ_API_KEY", None)
 
@@ -172,39 +170,69 @@ def video_gen(prompt):
     client = Client("KingNish/Instant-Video")
     return client.predict(prompt, api_name="/instant_video")
 
-def llava(user_prompt, chat_history):
+image_extensions = Image.registered_extensions()
+video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg", "wav", "gif", "webm", "m4v", "3gp")
+
+def qwen_inference(user_prompt, chat_history):
+    images = []
+    text_input = user_prompt["text"]
+
+    # Handle multiple image uploads
     if user_prompt["files"]:
-        image = user_prompt["files"][0]
+        images.extend(user_prompt["files"])
     else:
         for hist in chat_history:
-            if type(hist[0])==tuple:
-                image = hist[0][0]
-
-    txt = user_prompt["text"]
-    img = user_prompt["files"]
-
-    video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg", "wav", "gif", "webm", "m4v", "3gp")
-    image_extensions = Image.registered_extensions()
-    image_extensions = tuple([ex for ex, f in image_extensions.items()])
-
-    if image.endswith(video_extensions):
-        image = sample_frames(image)
-        gr.Info("Analyzing Video")
-        image_tokens = "<image>" * int(len(image))
-        prompt = f"<|im_start|>user {image_tokens}\n{user_prompt}<|im_end|><|im_start|>assistant"
-
-    elif image.endswith(image_extensions):
-        image = Image.open(image).convert("RGB")
-        gr.Info("Analyzing image")
-        prompt = f"<|im_start|>user <image>\n{user_prompt}<|im_end|><|im_start|>assistant"
-
-    system_llava = "<|im_start|>system\nYou are OpenGPT 4o, an exceptionally capable and versatile AI assistant made by KingNish. Your task is to fulfill users query in best possible way. You are provided with image, videos and 3d structures as input with question your task is to give best possible detailed results to user according to their query. Reply the question asked by user properly and best possible way.<|im_end|>"
-
-    final_prompt = f"{system_llava}\n{prompt}"
-
-    inputs = processor(final_prompt, image, return_tensors="pt").to("cuda", torch.float16)
+            if type(hist[0]) == tuple:
+                images.extend(hist[0])
+
+    # System Prompt (Similar to LLaVA)
+    SYSTEM_PROMPT = "You are OpenGPT 4o, an exceptionally capable and versatile AI assistant made by KingNish. Your task is to fulfill users query in best possible way. You are provided with image, videos and 3d structures as input with question your task is to give best possible detailed results to user according to their query. Reply the question asked by user properly and best possible way."
+
+    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
+
+    for image in images:
+        if image.endswith(video_extensions):
+            messages.append({
+                "role": "user",
+                "content": [
+                    {"type": "video", "video": image},
+                ]
+            })
+
+        if image.endswith(tuple([i for i, f in image_extensions.items()])):
+            messages.append({
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": image},
+                ]
+            })
+
+    # Add user text input
+    messages.append({
+        "role": "user",
+        "content": [
+            {"type": "text", "text": text_input}
+        ]
+    })
+
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    ).to("cuda")
+
+    streamer = TextIteratorStreamer(
+        processor, skip_prompt=True, **{"skip_special_tokens": True}
+    )
+    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048)
 
-    return inputs
+    return generation_kwargs
 
 # Initialize inference clients for different models
 client_mistral = InferenceClient("mistralai/Mistral-7B-Instruct-v0.3")
@@ -215,9 +243,7 @@ client_mistral_nemo = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407")
 @spaces.GPU(duration=60, queue=False)
 def model_inference( user_prompt, chat_history):
     if user_prompt["files"]:
-        inputs = llava(user_prompt, chat_history)
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, **{"skip_special_tokens": True})
-        generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048)
+        generation_kwargs = qwen_inference(user_prompt, chat_history)
 
         thread = Thread(target=model.generate, kwargs=generation_kwargs)
         thread.start()
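
The last hunk ends at thread.start(); a minimal sketch of how the streamed output from the new qwen_inference path would typically be consumed (the stream_reply wrapper and yield loop below are assumptions for illustration, not part of this commit; model and qwen_inference are the objects defined in chatbot.py):

# Sketch (assumption, not part of this commit): consuming the streamed output
# after thread.start(). `model` and `qwen_inference` come from chatbot.py above.
from threading import Thread

def stream_reply(user_prompt, chat_history):
    generation_kwargs = qwen_inference(user_prompt, chat_history)
    streamer = generation_kwargs["streamer"]  # the TextIteratorStreamer built inside qwen_inference

    # model.generate runs in a background thread while the streamer yields decoded text chunks
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer  # growing partial reply, e.g. for a Gradio chat callback
    thread.join()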