KB-VQA

Sleeping

App Files Files Community

m7mdal7aj committed on May 17

Commit

a97003a

•

1 Parent(s): d8f32a4

Update my_model/KBVQA.py

Browse files

Files changed (1) hide show

my_model/KBVQA.py +240 -240

my_model/KBVQA.py CHANGED Viewed

@@ -112,248 +112,248 @@ class KBVQA:
         self.current_prompt_length = None
-def create_bnb_config(self) -> BitsAndBytesConfig:
-    """
-    Creates a BitsAndBytes configuration based on the quantization setting.
-    Returns:
-        BitsAndBytesConfig: Configuration for BitsAndBytes optimized model.
-    """
-    if self.quantization == '4bit':
-        return BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_use_double_quant=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=torch.bfloat16
-        )
-    elif self.quantization == '8bit':
-        return BitsAndBytesConfig(
-            load_in_8bit=True,
-            bnb_8bit_use_double_quant=True,
-            bnb_8bit_quant_type="nf4",
-            bnb_8bit_compute_dtype=torch.bfloat16
-        )
-def load_caption_model(self) -> None:
-    """
-    Loads the image captioning model into the KBVQA instance.
-    Returns:
-        None
-    """
-    self.captioner = ImageCaptioningModel()
-    self.captioner.load_model()
-    free_gpu_resources()
-def get_caption(self, img: Image.Image) -> str:
-    """
-    Generates a caption for a given image using the image captioning model.
-    Args:
-        img (PIL.Image.Image): The image for which to generate a caption.
-    Returns:
-        str: The generated caption for the image.
-    """
-    caption = self.captioner.generate_caption(img)
-    free_gpu_resources()
-    return caption
-def load_detector(self, model: str) -> None:
-    """
-    Loads the object detection model.
-    Args:
-        model (str): The name of the object detection model to load.
-    Returns:
-        None
-    """
-    self.detector = ObjectDetector()
-    self.detector.load_model(model)
-    free_gpu_resources()
-def detect_objects(self, img: Image.Image) -> Tuple[Image.Image, str]:
-    """
-    Detects objects in a given image using the loaded object detection model.
-    Args:
-        img (PIL.Image.Image): The image in which to detect objects.
-    Returns:
-        tuple: A tuple containing the image with detected objects drawn and a string representation of detected objects.
-    """
-    image = self.detector.process_image(img)
-    free_gpu_resources()
-    detected_objects_string, detected_objects_list = self.detector.detect_objects(image, threshold=st.session_state[
-        'confidence_level'])
-    free_gpu_resources()
-    image_with_boxes = self.detector.draw_boxes(img, detected_objects_list)
-    free_gpu_resources()
-    return image_with_boxes, detected_objects_string
-def load_fine_tuned_model(self) -> None:
-    """
-    Loads the fine-tuned KBVQA model along with its tokenizer.
-    Returns:
-        None
-    """
-    self.kbvqa_model = AutoModelForCausalLM.from_pretrained(self.kbvqa_model_name,
-                                                            device_map="auto",
-                                                            low_cpu_mem_usage=True,
-                                                            quantization_config=self.bnb_config,
-                                                            token=self.access_token)
-    free_gpu_resources()
-    self.kbvqa_tokenizer = AutoTokenizer.from_pretrained(self.kbvqa_model_name,
-                                                         use_fast=self.use_fast,
-                                                         low_cpu_mem_usage=True,
-                                                         trust_remote_code=self.trust_remote,
-                                                         add_eos_token=self.add_eos_token,
-                                                         token=self.access_token)
-    free_gpu_resources()
-@property
-def all_models_loaded(self) -> bool:
-    """
-    Checks if all the required models (KBVQA, captioner, detector) are loaded.
-    Returns:
-        bool: True if all models are loaded, False otherwise.
-    """
-    return self.kbvqa_model is not None and self.captioner is not None and self.detector is not None
-def format_prompt(self, current_query: str, history: Optional[str] = None, sys_prompt: Optional[str] = None,
-                  caption: str = None, objects: Optional[str] = None) -> str:
-    """
-    Formats the prompt for the KBVQA model based on the provided parameters.
-    This implements the Prompt Engineering Module of the Overall KB-VQA Archetecture.
-    Args:
-        current_query (str): The current question to be answered.
-        history (str, optional): The history of previous interactions.
-        sys_prompt (str, optional): The system prompt or instructions for the model.
-        caption (str, optional): The caption of the image.
-        objects (str, optional): The detected objects in the image.
-    Returns:
-        str: The formatted prompt for the KBVQA model.
-    """
-    # These are the special tokens designed for the model to be fine-tuned on.
-    B_CAP = '[CAP]'
-    E_CAP = '[/CAP]'
-    B_QES = '[QES]'
-    E_QES = '[/QES]'
-    B_OBJ = '[OBJ]'
-    E_OBJ = '[/OBJ]'
-    # These are the default special tokens of LLaMA-2 Chat Model.
-    B_SENT = '<s>'
-    E_SENT = '</s>'
-    B_INST = '[INST]'
-    E_INST = '[/INST]'
-    B_SYS = '<<SYS>>\n'
-    E_SYS = '\n<</SYS>>\n\n'
-    current_query = current_query.strip()
-    if sys_prompt is None:
-        sys_prompt = config.SYSTEM_PROMPT.strip()
-    # History can be used to facilitate multi turn chat, not used for the Run Inference tool within the demo app.
-    if history is None:
-        if objects is None:
-            p = f"""{B_SENT}{B_INST} {B_SYS}{sys_prompt}{E_SYS}{B_CAP}{caption}{E_CAP}{B_QES}{current_query}{E_QES}{E_INST}"""
         else:
-            p = f"""{B_SENT}{B_INST} {B_SYS}{sys_prompt}{E_SYS}{B_CAP}{caption}{E_CAP}{B_OBJ}{objects}{E_OBJ}{B_QES}taking into consideration the objects with high certainty, {current_query}{E_QES}{E_INST}"""
-    else:
-        p = f"""{history}\n{B_SENT}{B_INST} {B_QES}{current_query}{E_QES}{E_INST}"""
-    return p
-@staticmethod
-def trim_objects(detected_objects_str: str) -> str:
-    """
-    Trim the last object from the detected objects string.
-    This is implemented to ensure that the prompt length is within the context window, threshold set to 4,000 tokens.
-    Args:
-        detected_objects_str (str): String containing detected objects.
-    Returns:
-        str: The string with the last object removed.
-    """
-    objects = detected_objects_str.strip().split("\n")
-    if len(objects) >= 1:
-        return "\n".join(objects[:-1])
-    return ""
-def generate_answer(self, question: str, caption: str, detected_objects_str: str) -> str:
-    """
-    Generates an answer to a given question using the KBVQA model.
-    Args:
-        question (str): The question to be answered.
-        caption (str): The caption of the image related to the question.
-        detected_objects_str (str): The string representation of detected objects in the image.
-    Returns:
-        str: The generated answer to the question.
-    """
-    free_gpu_resources()
-    prompt = self.format_prompt(question, caption=caption, objects=detected_objects_str)
-    num_tokens = len(self.kbvqa_tokenizer.tokenize(prompt))
-    self.current_prompt_length = num_tokens
-    trim = False  # flag used to check if prompt trim is required or no.
-    # max_context_window is set to 4,000 tokens, refer to the config file.
-    if self.current_prompt_length > self.max_context_window:
-        trim = True
-        st.warning(
-            f"Prompt length is {self.current_prompt_length} which is larger than the maximum context window of LLaMA-2,"
-            f" objects detected with low confidence will be removed one at a time until the prompt length is within the"
-            f" maximum context window ...")
-    # an object is trimmed from the bottom of the list until the overall prompt length is within the context window.
-    while self.current_prompt_length > self.max_context_window:
-        detected_objects_str = self.trim_objects(detected_objects_str)
         prompt = self.format_prompt(question, caption=caption, objects=detected_objects_str)
-        self.current_prompt_length = len(self.kbvqa_tokenizer.tokenize(prompt))
-        if detected_objects_str == "":
-            break  # Break if no objects are left
-    if trim:
-        st.warning(f"New prompt length is: {self.current_prompt_length}")
-        trim = False
-    model_inputs = self.kbvqa_tokenizer(prompt, add_special_tokens=False, return_tensors="pt").to('cuda')
-    free_gpu_resources()
-    input_ids = model_inputs["input_ids"]
-    output_ids = self.kbvqa_model.generate(input_ids)
-    free_gpu_resources()
-    index = input_ids.shape[1]  # needed to avoid printing the input prompt
-    history = self.kbvqa_tokenizer.decode(output_ids[0], skip_special_tokens=False)
-    output_text = self.kbvqa_tokenizer.decode(output_ids[0][index:], skip_special_tokens=True)
-    return output_text.capitalize()
 def prepare_kbvqa_model(only_reload_detection_model: bool = False, force_reload: bool = False) -> KBVQA:
     """

         self.current_prompt_length = None
+    def create_bnb_config(self) -> BitsAndBytesConfig:
+        """
+        Creates a BitsAndBytes configuration based on the quantization setting.
+        Returns:
+            BitsAndBytesConfig: Configuration for BitsAndBytes optimized model.
+        """
+        if self.quantization == '4bit':
+            return BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.bfloat16
+            )
+        elif self.quantization == '8bit':
+            return BitsAndBytesConfig(
+                load_in_8bit=True,
+                bnb_8bit_use_double_quant=True,
+                bnb_8bit_quant_type="nf4",
+                bnb_8bit_compute_dtype=torch.bfloat16
+            )
+    def load_caption_model(self) -> None:
+        """
+        Loads the image captioning model into the KBVQA instance.
+        Returns:
+            None
+        """
+        self.captioner = ImageCaptioningModel()
+        self.captioner.load_model()
+        free_gpu_resources()
+    def get_caption(self, img: Image.Image) -> str:
+        """
+        Generates a caption for a given image using the image captioning model.
+        Args:
+            img (PIL.Image.Image): The image for which to generate a caption.
+        Returns:
+            str: The generated caption for the image.
+        """
+        caption = self.captioner.generate_caption(img)
+        free_gpu_resources()
+        return caption
+    def load_detector(self, model: str) -> None:
+        """
+        Loads the object detection model.
+        Args:
+            model (str): The name of the object detection model to load.
+        Returns:
+            None
+        """
+        self.detector = ObjectDetector()
+        self.detector.load_model(model)
+        free_gpu_resources()
+    def detect_objects(self, img: Image.Image) -> Tuple[Image.Image, str]:
+        """
+        Detects objects in a given image using the loaded object detection model.
+        Args:
+            img (PIL.Image.Image): The image in which to detect objects.
+        Returns:
+            tuple: A tuple containing the image with detected objects drawn and a string representation of detected objects.
+        """
+        image = self.detector.process_image(img)
+        free_gpu_resources()
+        detected_objects_string, detected_objects_list = self.detector.detect_objects(image, threshold=st.session_state[
+            'confidence_level'])
+        free_gpu_resources()
+        image_with_boxes = self.detector.draw_boxes(img, detected_objects_list)
+        free_gpu_resources()
+        return image_with_boxes, detected_objects_string
+    def load_fine_tuned_model(self) -> None:
+        """
+        Loads the fine-tuned KBVQA model along with its tokenizer.
+        Returns:
+            None
+        """
+        self.kbvqa_model = AutoModelForCausalLM.from_pretrained(self.kbvqa_model_name,
+                                                                device_map="auto",
+                                                                low_cpu_mem_usage=True,
+                                                                quantization_config=self.bnb_config,
+                                                                token=self.access_token)
+        free_gpu_resources()
+        self.kbvqa_tokenizer = AutoTokenizer.from_pretrained(self.kbvqa_model_name,
+                                                             use_fast=self.use_fast,
+                                                             low_cpu_mem_usage=True,
+                                                             trust_remote_code=self.trust_remote,
+                                                             add_eos_token=self.add_eos_token,
+                                                             token=self.access_token)
+        free_gpu_resources()
+    @property
+    def all_models_loaded(self) -> bool:
+        """
+        Checks if all the required models (KBVQA, captioner, detector) are loaded.
+        Returns:
+            bool: True if all models are loaded, False otherwise.
+        """
+        return self.kbvqa_model is not None and self.captioner is not None and self.detector is not None
+    def format_prompt(self, current_query: str, history: Optional[str] = None, sys_prompt: Optional[str] = None,
+                      caption: str = None, objects: Optional[str] = None) -> str:
+        """
+        Formats the prompt for the KBVQA model based on the provided parameters.
+        This implements the Prompt Engineering Module of the Overall KB-VQA Architecture.
+        Args:
+            current_query (str): The current question to be answered.
+            history (str, optional): The history of previous interactions.
+            sys_prompt (str, optional): The system prompt or instructions for the model.
+            caption (str, optional): The caption of the image.
+            objects (str, optional): The detected objects in the image.
+        Returns:
+            str: The formatted prompt for the KBVQA model.
+        """
+        # These are the special tokens designed for the model to be fine-tuned on.
+        B_CAP = '[CAP]'
+        E_CAP = '[/CAP]'
+        B_QES = '[QES]'
+        E_QES = '[/QES]'
+        B_OBJ = '[OBJ]'
+        E_OBJ = '[/OBJ]'
+        # These are the default special tokens of LLaMA-2 Chat Model.
+        B_SENT = '<s>'
+        E_SENT = '</s>'
+        B_INST = '[INST]'
+        E_INST = '[/INST]'
+        B_SYS = '<<SYS>>\n'
+        E_SYS = '\n<</SYS>>\n\n'
+        current_query = current_query.strip()
+        if sys_prompt is None:
+            sys_prompt = config.SYSTEM_PROMPT.strip()
+        # History can be used to facilitate multi turn chat, not used for the Run Inference tool within the demo app.
+        if history is None:
+            if objects is None:
+                p = f"""{B_SENT}{B_INST} {B_SYS}{sys_prompt}{E_SYS}{B_CAP}{caption}{E_CAP}{B_QES}{current_query}{E_QES}{E_INST}"""
+            else:
+                p = f"""{B_SENT}{B_INST} {B_SYS}{sys_prompt}{E_SYS}{B_CAP}{caption}{E_CAP}{B_OBJ}{objects}{E_OBJ}{B_QES}taking into consideration the objects with high certainty, {current_query}{E_QES}{E_INST}"""
         else:
+            p = f"""{history}\n{B_SENT}{B_INST} {B_QES}{current_query}{E_QES}{E_INST}"""
+        return p
+    @staticmethod
+    def trim_objects(detected_objects_str: str) -> str:
+        """
+        Trim the last object from the detected objects string.
+        This is implemented to ensure that the prompt length is within the context window, threshold set to 4,000 tokens.
+        Args:
+            detected_objects_str (str): String containing detected objects.
+        Returns:
+            str: The string with the last object removed.
+        """
+        objects = detected_objects_str.strip().split("\n")
+        if len(objects) >= 1:
+            return "\n".join(objects[:-1])
+        return ""
+    def generate_answer(self, question: str, caption: str, detected_objects_str: str) -> str:
+        """
+        Generates an answer to a given question using the KBVQA model.
+        Args:
+            question (str): The question to be answered.
+            caption (str): The caption of the image related to the question.
+            detected_objects_str (str): The string representation of detected objects in the image.
+        Returns:
+            str: The generated answer to the question.
+        """
+        free_gpu_resources()
         prompt = self.format_prompt(question, caption=caption, objects=detected_objects_str)
+        num_tokens = len(self.kbvqa_tokenizer.tokenize(prompt))
+        self.current_prompt_length = num_tokens
+        trim = False  # flag used to check if prompt trim is required or not.
+        # max_context_window is set to 4,000 tokens, refer to the config file.
+        if self.current_prompt_length > self.max_context_window:
+            trim = True
+            st.warning(
+                f"Prompt length is {self.current_prompt_length} which is larger than the maximum context window of LLaMA-2,"
+                f" objects detected with low confidence will be removed one at a time until the prompt length is within the"
+                f" maximum context window ...")
+        # an object is trimmed from the bottom of the list until the overall prompt length is within the context window.
+        while self.current_prompt_length > self.max_context_window:
+            detected_objects_str = self.trim_objects(detected_objects_str)
+            prompt = self.format_prompt(question, caption=caption, objects=detected_objects_str)
+            self.current_prompt_length = len(self.kbvqa_tokenizer.tokenize(prompt))
+            if detected_objects_str == "":
+                break  # Break if no objects are left
+        if trim:
+            st.warning(f"New prompt length is: {self.current_prompt_length}")
+            trim = False
+        model_inputs = self.kbvqa_tokenizer(prompt, add_special_tokens=False, return_tensors="pt").to('cuda')
+        free_gpu_resources()
+        input_ids = model_inputs["input_ids"]
+        output_ids = self.kbvqa_model.generate(input_ids)
+        free_gpu_resources()
+        index = input_ids.shape[1]  # needed to avoid printing the input prompt
+        history = self.kbvqa_tokenizer.decode(output_ids[0], skip_special_tokens=False)
+        output_text = self.kbvqa_tokenizer.decode(output_ids[0][index:], skip_special_tokens=True)
+        return output_text.capitalize()
 def prepare_kbvqa_model(only_reload_detection_model: bool = False, force_reload: bool = False) -> KBVQA:
     """