Spaces:
Running
Running
File size: 18,042 Bytes
fc498e0 c59fc6b 0b430a0 c59fc6b 61e10b7 9f7ab84 d26dd8d 24fe4cb 61e10b7 f958c4b c59fc6b 64de28a 61e10b7 fc498e0 61e10b7 d72aea6 61e10b7 c59fc6b fc498e0 c0e58be e59b1eb 502ab4a fc498e0 502ab4a d72aea6 c59fc6b 61e10b7 fc498e0 61e10b7 fc498e0 c59fc6b fc498e0 61e10b7 fc498e0 c59fc6b fc498e0 61e10b7 fc498e0 61e10b7 fc498e0 c59fc6b 4d96ac5 fc498e0 c59fc6b fc498e0 61e10b7 fc498e0 e57843e fc498e0 e57843e c59fc6b fc498e0 c59fc6b fc498e0 c59fc6b fc498e0 d148f27 fc498e0 c59fc6b 61e10b7 fc498e0 61e10b7 fc498e0 c59fc6b fc498e0 f8ba7cc c59fc6b fc498e0 c59fc6b 4c70c9c 61e10b7 fc498e0 61e10b7 fc498e0 61e10b7 fc498e0 61347b1 33ddd8a 1cdf777 fc498e0 33ddd8a 1cdf777 33ddd8a e21af99 1cdf777 e21af99 fc498e0 5b25ca3 fc498e0 61347b1 7ea3839 61347b1 e21af99 61347b1 fc498e0 61347b1 c59fc6b fc498e0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 |
# Main script for KBVQA: Knowledge-Based Visual Question Answering Module
# This module is the central component for implementing the designed model architecture for the Knowledge-Based Visual
# Question Answering (KB-VQA) project. It integrates various sub-modules, including image captioning, object detection,
# and a fine-tuned language model, to provide a comprehensive solution for answering questions based on visual input.
# --- Description ---
# **KBVQA class**:
# The KBVQA class encapsulates the functionality needed to perform visual question answering using a combination of
# multimodal models.
# The class handles the following tasks:
# - Loading and managing a fine-tuned language model (LLaMA-2) for question answering.
# - Integrating an image captioning model to generate descriptive captions for input images.
# - Utilizing an object detection model to identify and describe objects within the images.
# - Formatting and generating prompts for the language model based on the image captions and detected objects.
# - Providing methods to analyze images and generate answers to user-provided questions.
# **prepare_kbvqa_model function**:
# - The prepare_kbvqa_model function orchestrates the loading and initialization of the KBVQA class, ensuring it is
# ready for inference.
# ---Instructions---
# **Model Preparation**:
# Use the prepare_kbvqa_model function to prepare and initialize the KBVQA system, ensuring all required models are
# loaded and ready for use.
# **Image Processing and Question Answering**:
# Use the get_caption method to generate captions for input images.
# Use the detect_objects method to identify and describe objects in the images.
# Use the generate_answer method to answer questions based on the image captions and detected objects.
# This module forms the backbone of the KB-VQA project, integrating advanced models to provide an end-to-end solution
# for visual question answering tasks.
# Ensure all dependencies are installed and the required configuration file is in place before running this script.
# The configurations for the KBVQA class are defined in the 'my_model/config/kbvqa_config.py' file.
# ---------- Please run this module to utilize the full KB-VQA functionality ----------#
# ---------- Please ensure this is run on a GPU ----------#
import streamlit as st
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from typing import Tuple, Optional
from my_model.utilities.gen_utilities import free_gpu_resources
from my_model.captioner.image_captioning import ImageCaptioningModel
from my_model.detector.object_detection import ObjectDetector
import my_model.config.kbvqa_config as config
class KBVQA:
"""
The KBVQA class encapsulates the functionality for the Knowledge-Based Visual Question Answering (KBVQA) model.
It integrates various components such as an image captioning model, object detection model, and a fine-tuned
language model (LLAMA2) on OK-VQA dataset for generating answers to visual questions.
Attributes:
kbvqa_model_name (str): Name of the fine-tuned language model used for KBVQA.
quantization (str): The quantization setting for the model (e.g., '4bit', '8bit').
max_context_window (int): The maximum number of tokens allowed in the model's context window.
add_eos_token (bool): Flag to indicate whether to add an end-of-sentence token to the tokenizer.
trust_remote (bool): Flag to indicate whether to trust remote code when using the tokenizer.
use_fast (bool): Flag to indicate whether to use the fast version of the tokenizer.
low_cpu_mem_usage (bool): Flag to optimize model loading for low CPU memory usage.
kbvqa_tokenizer (Optional[AutoTokenizer]): The tokenizer for the KBVQA model.
captioner (Optional[ImageCaptioningModel]): The model used for generating image captions.
detector (Optional[ObjectDetector]): The object detection model.
detection_model (Optional[str]): The name of the object detection model.
detection_confidence (Optional[float]): The confidence threshold for object detection.
kbvqa_model (Optional[AutoModelForCausalLM]): The fine-tuned language model for KBVQA.
bnb_config (BitsAndBytesConfig): Configuration for BitsAndBytes optimized model.
access_token (str): Access token for Hugging Face API.
current_prompt_length (int): Prompt length.
Methods:
create_bnb_config: Creates a BitsAndBytes configuration based on the quantization setting.
load_caption_model: Loads the image captioning model.
get_caption: Generates a caption for a given image.
load_detector: Loads the object detection model.
detect_objects: Detects objects in a given image.
load_fine_tuned_model: Loads the fine-tuned KBVQA model along with its tokenizer.
all_models_loaded: Checks if all the required models are loaded.
force_reload_model: Forces a reload of all models, freeing up GPU resources.
format_prompt: Formats the prompt for the KBVQA model.
generate_answer: Generates an answer to a given question using the KBVQA model.
"""
def __init__(self) -> None:
"""
Initializes the KBVQA instance with configuration parameters.
"""
if st.session_state["method"] == "7b-Fine-Tuned Model":
self.kbvqa_model_name: str = config.KBVQA_MODEL_NAME_7b
elif st.session_state["method"] == "13b-Fine-Tuned Model":
self.kbvqa_model_name: str = config.KBVQA_MODEL_NAME_13b
self.quantization: str = config.QUANTIZATION
self.max_context_window: int = config.MAX_CONTEXT_WINDOW # set to 4,000 tokens
self.add_eos_token: bool = config.ADD_EOS_TOKEN
self.trust_remote: bool = config.TRUST_REMOTE
self.use_fast: bool = config.USE_FAST
self.low_cpu_mem_usage: bool = config.LOW_CPU_MEM_USAGE
self.kbvqa_tokenizer: Optional[AutoTokenizer] = None
self.captioner: Optional[ImageCaptioningModel] = None
self.detector: Optional[ObjectDetector] = None
self.detection_model: Optional[str] = None
self.detection_confidence: Optional[float] = None
self.kbvqa_model: Optional[AutoModelForCausalLM] = None
self.bnb_config: BitsAndBytesConfig = self.create_bnb_config()
self.access_token: str = config.HUGGINGFACE_TOKEN
self.current_prompt_length = None
def create_bnb_config(self) -> BitsAndBytesConfig:
"""
Creates a BitsAndBytes configuration based on the quantization setting.
Returns:
BitsAndBytesConfig: Configuration for BitsAndBytes optimized model.
"""
if self.quantization == '4bit':
return BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16
)
elif self.quantization == '8bit':
return BitsAndBytesConfig(
load_in_8bit=True,
bnb_8bit_use_double_quant=True,
bnb_8bit_quant_type="nf4",
bnb_8bit_compute_dtype=torch.bfloat16
)
def load_caption_model(self) -> None:
"""
Loads the image captioning model into the KBVQA instance.
Returns:
None
"""
self.captioner = ImageCaptioningModel()
self.captioner.load_model()
free_gpu_resources()
def get_caption(self, img: Image.Image) -> str:
"""
Generates a caption for a given image using the image captioning model.
Args:
img (PIL.Image.Image): The image for which to generate a caption.
Returns:
str: The generated caption for the image.
"""
caption = self.captioner.generate_caption(img)
free_gpu_resources()
return caption
def load_detector(self, model: str) -> None:
"""
Loads the object detection model.
Args:
model (str): The name of the object detection model to load.
Returns:
None
"""
self.detector = ObjectDetector()
self.detector.load_model(model)
free_gpu_resources()
def detect_objects(self, img: Image.Image) -> Tuple[Image.Image, str]:
"""
Detects objects in a given image using the loaded object detection model.
Args:
img (PIL.Image.Image): The image in which to detect objects.
Returns:
tuple: A tuple containing the image with detected objects drawn and a string representation of detected objects.
"""
image = self.detector.process_image(img)
free_gpu_resources()
detected_objects_string, detected_objects_list = self.detector.detect_objects(image, threshold=st.session_state[
'confidence_level'])
free_gpu_resources()
image_with_boxes = self.detector.draw_boxes(img, detected_objects_list)
free_gpu_resources()
return image_with_boxes, detected_objects_string
def load_fine_tuned_model(self) -> None:
"""
Loads the fine-tuned KBVQA model along with its tokenizer.
Returns:
None
"""
self.kbvqa_model = AutoModelForCausalLM.from_pretrained(self.kbvqa_model_name,
device_map="auto",
low_cpu_mem_usage=True,
quantization_config=self.bnb_config,
token=self.access_token)
free_gpu_resources()
self.kbvqa_tokenizer = AutoTokenizer.from_pretrained(self.kbvqa_model_name,
use_fast=self.use_fast,
low_cpu_mem_usage=True,
trust_remote_code=self.trust_remote,
add_eos_token=self.add_eos_token,
token=self.access_token)
free_gpu_resources()
@property
def all_models_loaded(self) -> bool:
"""
Checks if all the required models (KBVQA, captioner, detector) are loaded.
Returns:
bool: True if all models are loaded, False otherwise.
"""
return self.kbvqa_model is not None and self.captioner is not None and self.detector is not None
def format_prompt(self, current_query: str, history: Optional[str] = None, sys_prompt: Optional[str] = None,
caption: str = None, objects: Optional[str] = None) -> str:
"""
Formats the prompt for the KBVQA model based on the provided parameters.
This implements the Prompt Engineering Module of the Overall KB-VQA Archetecture.
Args:
current_query (str): The current question to be answered.
history (str, optional): The history of previous interactions.
sys_prompt (str, optional): The system prompt or instructions for the model.
caption (str, optional): The caption of the image.
objects (str, optional): The detected objects in the image.
Returns:
str: The formatted prompt for the KBVQA model.
"""
# These are the special tokens designed for the model to be fine-tuned on.
B_CAP = '[CAP]'
E_CAP = '[/CAP]'
B_QES = '[QES]'
E_QES = '[/QES]'
B_OBJ = '[OBJ]'
E_OBJ = '[/OBJ]'
# These are the default special tokens of LLaMA-2 Chat Model.
B_SENT = '<s>'
E_SENT = '</s>'
B_INST = '[INST]'
E_INST = '[/INST]'
B_SYS = '<<SYS>>\n'
E_SYS = '\n<</SYS>>\n\n'
current_query = current_query.strip()
if sys_prompt is None:
sys_prompt = config.SYSTEM_PROMPT.strip()
# History can be used to facilitate multi turn chat, not used for the Run Inference tool within the demo app.
if history is None:
if objects is None:
p = f"""{B_SENT}{B_INST} {B_SYS}{sys_prompt}{E_SYS}{B_CAP}{caption}{E_CAP}{B_QES}{current_query}{E_QES}{E_INST}"""
else:
p = f"""{B_SENT}{B_INST} {B_SYS}{sys_prompt}{E_SYS}{B_CAP}{caption}{E_CAP}{B_OBJ}{objects}{E_OBJ}{B_QES}taking into consideration the objects with high certainty, {current_query}{E_QES}{E_INST}"""
else:
p = f"""{history}\n{B_SENT}{B_INST} {B_QES}{current_query}{E_QES}{E_INST}"""
return p
@staticmethod
def trim_objects(detected_objects_str: str) -> str:
"""
Trim the last object from the detected objects string.
This is implemented to ensure that the prompt length is within the context window, threshold set to 4,000 tokens.
Args:
detected_objects_str (str): String containing detected objects.
Returns:
str: The string with the last object removed.
"""
objects = detected_objects_str.strip().split("\n")
if len(objects) >= 1:
return "\n".join(objects[:-1])
return ""
def generate_answer(self, question: str, caption: str, detected_objects_str: str) -> str:
"""
Generates an answer to a given question using the KBVQA model.
Args:
question (str): The question to be answered.
caption (str): The caption of the image related to the question.
detected_objects_str (str): The string representation of detected objects in the image.
Returns:
str: The generated answer to the question.
"""
free_gpu_resources()
prompt = self.format_prompt(question, caption=caption, objects=detected_objects_str)
num_tokens = len(self.kbvqa_tokenizer.tokenize(prompt))
self.current_prompt_length = num_tokens
trim = False # flag used to check if prompt trim is required or no.
# max_context_window is set to 4,000 tokens, refer to the config file.
if self.current_prompt_length > self.max_context_window:
trim = True
st.warning(
f"Prompt length is {self.current_prompt_length} which is larger than the maximum context window of LLaMA-2,"
f" objects detected with low confidence will be removed one at a time until the prompt length is within the"
f" maximum context window ...")
# an object is trimmed from the bottom of the list until the overall prompt length is within the context window.
while self.current_prompt_length > self.max_context_window:
detected_objects_str = self.trim_objects(detected_objects_str)
prompt = self.format_prompt(question, caption=caption, objects=detected_objects_str)
self.current_prompt_length = len(self.kbvqa_tokenizer.tokenize(prompt))
if detected_objects_str == "":
break # Break if no objects are left
if trim:
st.warning(f"New prompt length is: {self.current_prompt_length}")
trim = False
model_inputs = self.kbvqa_tokenizer(prompt, add_special_tokens=False, return_tensors="pt").to('cuda')
free_gpu_resources()
input_ids = model_inputs["input_ids"]
output_ids = self.kbvqa_model.generate(input_ids)
free_gpu_resources()
index = input_ids.shape[1] # needed to avoid printing the input prompt
history = self.kbvqa_tokenizer.decode(output_ids[0], skip_special_tokens=False)
output_text = self.kbvqa_tokenizer.decode(output_ids[0][index:], skip_special_tokens=True)
return output_text.capitalize()
def prepare_kbvqa_model(only_reload_detection_model: bool = False, force_reload: bool = False) -> KBVQA:
"""
Prepares the KBVQA model for use, including loading necessary sub-models.
This serves as the main function for loading and reloading the KB-VQA model.
Args:
only_reload_detection_model (bool): If True, only the object detection model is reloaded.
force_reload (bool): If True, forces the reload of all models.
Returns:
KBVQA: An instance of the KBVQA model ready for inference.
"""
if force_reload:
free_gpu_resources()
loading_message = 'Reloading model.. this should take no more than 2 or 3 minutes!'
try:
del st.session_state['kbvqa']
free_gpu_resources()
free_gpu_resources()
except:
free_gpu_resources()
free_gpu_resources()
pass
free_gpu_resources()
else:
loading_message = 'Looading model.. this should take no more than 2 or 3 minutes!'
free_gpu_resources()
kbvqa = KBVQA()
kbvqa.detection_model = st.session_state.detection_model
# Progress bar for model loading
with st.spinner(loading_message):
if not only_reload_detection_model:
progress_bar = st.progress(0)
kbvqa.load_detector(kbvqa.detection_model)
progress_bar.progress(33)
kbvqa.load_caption_model()
free_gpu_resources()
progress_bar.progress(75)
st.text('Almost there :)')
kbvqa.load_fine_tuned_model()
free_gpu_resources()
progress_bar.progress(100)
else:
free_gpu_resources()
progress_bar = st.progress(0)
kbvqa.load_detector(kbvqa.detection_model)
progress_bar.progress(100)
if kbvqa.all_models_loaded:
st.success('Model loaded successfully and ready for inferecne!')
kbvqa.kbvqa_model.eval()
free_gpu_resources()
return kbvqa
if __name__ == "__main__":
pass
#### Example on how to use the module ####
# Prepare the KBVQA model
# kbvqa = prepare_kbvqa_model()
# Load an image
# image = Image.open('path_to_image.jpg')
# Generate a caption for the image
# caption = kbvqa.get_caption(image)
# Detect objects in the image
# image_with_boxes, detected_objects_str = kbvqa.detect_objects(image)
# Generate an answer to a question about the image
# question = "What is the object in the image?"
# answer = kbvqa.generate_answer(question, caption, detected_objects_str)
# print(f"Answer: {answer}")
|