g-h-chen committed on
Commit
34b5eca
1 Parent(s): 6efcca1

upload modeling_llava_phi3.py

Files changed (1)
  1. modeling_llava_phi3.py +334 -0
modeling_llava_phi3.py ADDED
@@ -0,0 +1,334 @@
# Copyright 2023 Haotian Liu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import List, Optional, Tuple, Union

import torch
import torch.nn as nn
import math
import sys
import pdb
from typing import Dict, Any

from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel
# MistralConfig, MistralModel, MistralForCausalLM

from transformers.modeling_outputs import CausalLMOutputWithPast

from transformers.cache_utils import Cache, DynamicCache

from .llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
from .modeling_phi3 import Phi3ForCausalLM, Phi3Model, Phi3Config
from .generation_utils import build_allava_input


################ Phi ###############################

class LlavaPhi3Config(Phi3Config):
    model_type = "llava_phi3"


class LlavaPhi3Model(LlavaMetaModel, Phi3Model):
    config_class = LlavaPhi3Config

    def __init__(self, config: Phi3Config):
        super(LlavaPhi3Model, self).__init__(config)


class LlavaPhi3ForCausalLM(Phi3ForCausalLM, LlavaMetaForCausalLM):
    config_class = LlavaPhi3Config

    def __init__(self, config, init_vision_encoder_from_ckpt=True):
        config.flash_attn = True
        config.flash_rotary = True
        config.fused_dense = True
        config._attn_implementation = "flash_attention_2"

        super(Phi3ForCausalLM, self).__init__(config)
        # self.model is used in LlavaMetaForCausalLM.get_model(); self.transformer is used in PhiForCausalLM.forward()
        self.model = LlavaPhi3Model(config)
        if hasattr(self.model, '_use_flash_attention_2'):
            assert self.model._use_flash_attention_2, 'flash attn is not enabled. check it out!'
        # self.pretraining_tp = config.pretraining_tp
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        if init_vision_encoder_from_ckpt:
            vision_tower = self.get_vision_tower()
            print('loading from CLIP first. This should only be used at inference!!!')
            vision_tower.load_model()

        # Initialize weights and apply final processing
        self.post_init()

    # ############ these two methods are missing in modeling_phi.py
    # def get_input_embeddings(self) -> nn.Embedding:
    #     return self.model.embd.wte

    # def set_input_embeddings(self, new_embeddings: nn.Embedding) -> None:
    #     self.model.embd.wte = new_embeddings
    # ############ these two methods are missing in modeling_phi.py

    def get_model(self):
        return self.model

    def get_tokenizer(self):
        return self.tokenizer

    def get_processor(self):
        return self.model.vision_tower.image_processor

    def set_tokenizer_eos_id(self):
        eos_token_id = 30027  # only for llava_phi3
        self.tokenizer.eos_token_id = eos_token_id

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:

        # If raw token ids are given, splice the image features into the text embeddings first.
        # pdb.set_trace()
        if inputs_embeds is None:
            (
                input_ids,
                position_ids,
                attention_mask,
                past_key_values,
                inputs_embeds,
                labels
            # ) = self.prepare_inputs_labels_for_multimodal(
            ) = self.prepare_inputs_labels_for_multimodal_new(
                input_ids,
                position_ids,
                attention_mask,
                past_key_values,
                labels,
                images
            )

        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        **kwargs,
    ):
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is not None:
            # Build the multimodal embeddings; the returned past_key_values and labels are unused here.
            (
                inputs,
                position_ids,
                attention_mask,
                _,
                inputs_embeds,
                _
            ) = self.prepare_inputs_labels_for_multimodal_new(
                inputs,
                position_ids,
                attention_mask,
                None,
                None,
                images
            )
        else:
            inputs_embeds = self.get_model().embed_tokens(inputs)

        # print(inputs_embeds.shape)
        return super().generate(
            position_ids=None,
            attention_mask=None,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs):
        '''
        This function is called once per generated token at inference time.
        '''
        # pdb.set_trace()
        images = kwargs.pop("images", None)

        ####################################################
        # lines from modeling_phi.py
        ####################################################

        if past_key_values is not None:
            if isinstance(past_key_values, Cache):
                cache_length = past_key_values.get_seq_length()
                past_length = past_key_values.seen_tokens
                max_cache_length = past_key_values.get_max_length()
            else:
                cache_length = past_length = past_key_values[0][0].shape[2]
                max_cache_length = None

            # Keep only the unprocessed tokens:
            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
            # input)
            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):]
            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
            # input_ids based on the past_length.
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]
            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
            elif past_length >= input_ids.shape[1]:
                input_ids = input_ids[:, [-1]]  # only keep the last one!

            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
            if (
                max_cache_length is not None
                and attention_mask is not None
                and cache_length + input_ids.shape[1] > max_cache_length
            ):
                attention_mask = attention_mask[:, -max_cache_length:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1]:]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        ####################################################
        # end of lines from modeling_phi.py
        ####################################################

        # Forward the image tensors so that forward() can splice them in on the first step.
        if images is not None:
            model_inputs['images'] = images
        return model_inputs

    # def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
    #     images = kwargs.pop("images", None)
    #     _inputs = super().prepare_inputs_for_generation(
    #         input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
    #     )
    #     if images is not None:
    #         _inputs['images'] = images
    #     return _inputs

    def chat(
        self,
        texts: Optional[str | list[list[str, str]]],
        images: Optional[str | list[str]] = None,
        history: Optional[list[str]] = None,
        stream = False,
        return_history = False,
        **kwargs
    ):
        '''
        texts: if `str`, generate a reply for a single round; if a list of [user, assistant] turns, generate with that conversation as context.
        images: str or list[str] (optional), local path(s) to image(s).
        '''
        use_cache = kwargs.pop('use_cache', True)

        if 'eos_token_id' in kwargs:
            _ = kwargs.pop('eos_token_id', None)
            print(f'eos_token_id {_} from gen_kwargs is popped since it is not needed.')
        # pdb.set_trace()

        ############################
        # merge history
        ############################
        input_ids, image_tensors, history = build_allava_input(
            tokenizer=self.get_tokenizer(),
            processor=self.get_processor(),
            texts=texts,
            images=images,
            history=history,
            return_history=return_history,
            device=self.device
        )

        ############################
        # generate response
        ############################
        # with torch.autocast(device_type='cuda'):
        if 'cuda' in str(self.device):
            device_type = 'cuda'
        else:
            device_type = 'cpu'

        with torch.autocast(device_type=device_type, dtype=self.dtype):
            output_ids = self.generate(
                inputs=input_ids,
                images=image_tensors,
                use_cache=use_cache,
                **kwargs)

        answer = self.get_tokenizer().decode(output_ids[0, :], skip_special_tokens=True).strip()

        if return_history:
            history[-1][-1] = answer
            return answer, history
        return answer


AutoConfig.register("llava_phi3", LlavaPhi3Config)
AutoModelForCausalLM.register(LlavaPhi3Config, LlavaPhi3ForCausalLM)
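
For context, a minimal usage sketch (not part of the committed file). It assumes this module ships in a model repo alongside llava_arch.py, modeling_phi3.py and generation_utils.py and is loaded with trust_remote_code; the repo id and image path below are placeholders.

# Hypothetical usage example; "some-org/llava-phi3-mini" and "example.jpg" are placeholders.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "some-org/llava-phi3-mini"  # placeholder repo id

model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,  # loads LlavaPhi3ForCausalLM from this file
    device_map="cuda",
)
model.tokenizer = AutoTokenizer.from_pretrained(repo_id)  # chat() reads self.tokenizer

answer = model.chat(
    texts="What is shown in this image?",
    images="example.jpg",  # local path, as accepted by chat()
    max_new_tokens=256,
)
print(answer)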