Spaces:

RaviNaik
/

MultiModal-Phi2

Running

App Files Files Community

ravi.naik committed on Jan 25

Commit

667ae00

•

1 Parent(s): e752318

Fixed relative import issues

Browse files

Files changed (7) hide show

.gitignore +160 -0
README.md +32 -1
inference/model/builder.py +103 -44
inference/model/language_model/configuration_llava_phi.py +41 -29
inference/model/llava_arch.py +161 -39
inference/model/multimodal_encoder/clip_encoder.py +1 -1
requirements.txt +0 -1

.gitignore ADDED Viewed

	@@ -0,0 +1,160 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

README.md CHANGED Viewed

@@ -9,5 +9,36 @@ app_file: app.py
 pinned: false
 license: mit
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 pinned: false
 license: mit
 ---
+## Phi2 : Multimodal Finetuning
+### Details
+1. LLM Backbone: Phi2
+2. Vision Tower: clip-vit-large-patch14-336
+3. Audio Model: Whisper
+4. Pretraining Dataset: LAION-CC-SBU dataset with BLIP captions (200k samples)
+5. Finetuning Dataset: Instruct 150k dataset based on COCO
+### Design
+![image](https://github.com/RaviNaik/ERA-CAPSTONE/assets/23289802/56df24cd-2681-4e17-ab64-9652f609b15f)
+### Pretraining
+#### Training Loss Curve
+![image](https://github.com/RaviNaik/ERA-CAPSTONE/assets/23289802/b6c37a95-0a56-4b52-8719-3ff56dc1b703)
+#### Learning Rate
+![image](https://github.com/RaviNaik/ERA-CAPSTONE/assets/23289802/44d9a11b-b28d-47e1-ba1d-d6dc22ebe748)
+#### Training Logs
+![image](https://github.com/RaviNaik/ERA-CAPSTONE/assets/23289802/76543d98-d9fe-4c1a-ac47-3d06e48053ad)
+### Finetuning
+#### Training Loss Curve
+![image](https://github.com/RaviNaik/ERA-CAPSTONE/assets/23289802/45ef40bd-fae5-4cfe-a522-c0eed2833230)
+#### Learning Rate
+![image](https://github.com/RaviNaik/ERA-CAPSTONE/assets/23289802/df60ee62-a537-4e36-a7f7-f7111e101162)
+#### Training Logs
+![image](https://github.com/RaviNaik/ERA-CAPSTONE/assets/23289802/2747acce-bc99-4c37-a05a-d5e81cb9aa9d)
+### Results
+![image](https://github.com/RaviNaik/ERA-CAPSTONE/assets/23289802/f12a9f04-df32-413e-b957-774c30381b2b)

inference/model/builder.py CHANGED Viewed

@@ -1,105 +1,162 @@
 import os
 import warnings
-import shutil
-from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig, CLIPImageProcessor
 import torch
-from llava_phi.model import *
-from llava_phi.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
-def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="cuda", device="cuda"):
     kwargs = {"device_map": device_map}
     if load_8bit:
-        kwargs['load_in_8bit'] = True
     elif load_4bit:
-        kwargs['load_in_4bit'] = True
-        kwargs['quantization_config'] = BitsAndBytesConfig(
             load_in_4bit=True,
             bnb_4bit_compute_dtype=torch.float16,
             bnb_4bit_use_double_quant=True,
-            bnb_4bit_quant_type='nf4'
         )
     # else:  # TODO: after fine-tuning LLava-Phi, load the model weights with fp16 will pose nan
     #     kwargs['torch_dtype'] = torch.float16
-    if 'phi' in model_name.lower():
         # Load LLaVA-Phi model
-        if 'lora' in model_name.lower() and model_base is None:
-            warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument.')
-        if 'lora' in model_name.lower() and model_base is not None:
             lora_cfg_pretrained = AutoConfig.from_pretrained(model_path)
             tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
-            print('Loading LLaVA-Phi from base model...')
-            model = LlavaPhiForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
             token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features
             if model.lm_head.weight.shape[0] != token_num:
-                model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))
-                model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))
-            print('Loading additional LLaVA-Phi weights...')
-            if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')):
-                non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')
             else:
                 # this is probably from HF Hub
                 from huggingface_hub import hf_hub_download
                 def load_from_hf(repo_id, filename, subfolder=None):
                     cache_file = hf_hub_download(
-                        repo_id=repo_id,
-                        filename=filename,
-                        subfolder=subfolder)
-                    return torch.load(cache_file, map_location='cpu')
-                non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin')
-            non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
-            if any(k.startswith('model.model.') for k in non_lora_trainables):
-                non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
             model.load_state_dict(non_lora_trainables, strict=False)
             from peft import PeftModel
-            print('Loading LoRA weights...')
             model = PeftModel.from_pretrained(model, model_path)
-            print('Merging LoRA weights...')
             model = model.merge_and_unload()
-            print('Model is loaded...')
         elif model_base is not None:
             # this may be mm projector only
-            print('Loading LLaVA-Phi from base model...')
             tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
             cfg_pretrained = AutoConfig.from_pretrained(model_path)
-            model = LlavaPhiForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
-            mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu')
-            mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
             model.load_state_dict(mm_projector_weights, strict=False)
         else:
             print("load llaVA-Phi MLLM!!!")
             config = LlavaPhiConfig.from_pretrained(model_path, trust_remote_code=True)
             tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
             model = LlavaPhiForCausalLM.from_pretrained(
-                model_path,
-                config=config,
-                use_safetensors=True,
-                **kwargs).to("cuda")
     else:
         # Load language model
         if model_base is not None:
             # PEFT model
             from peft import PeftModel
             tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
-            model = AutoModelForCausalLM.from_pretrained(model_base, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto")
             print(f"Loading LoRA weights from {model_path}")
             model = PeftModel.from_pretrained(model, model_path)
             print(f"Merging weights")
             model = model.merge_and_unload()
-            print('Convert to FP16...')
             model.to(torch.float16)
         else:
             tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
-            model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
     image_processor = CLIPImageProcessor.from_pretrained(model_path)
-    if 'phi' in model_name.lower():
         mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
         mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
@@ -107,7 +164,9 @@ def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, l
         if mm_use_im_patch_token:
             tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
         if mm_use_im_start_end:
-            tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
             # model.resize_token_embeddings(len(tokenizer))
     else:
         raise ValueError(f"Unsupported model name: {model_name}")

 import os
 import warnings
+from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    AutoConfig,
+    BitsAndBytesConfig,
+    CLIPImageProcessor,
+)
 import torch
+from .language_model.llava_phi import LlavaPhiForCausalLM
+from .language_model.configuration_llava_phi import LlavaPhiConfig
+IGNORE_INDEX = -100
+IMAGE_TOKEN_INDEX = -200
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_IM_END_TOKEN = "<im_end>"
+def load_pretrained_model(
+    model_path,
+    model_base,
+    model_name,
+    load_8bit=False,
+    load_4bit=False,
+    device_map="cuda",
+    device="cuda",
+):
     kwargs = {"device_map": device_map}
     if load_8bit:
+        kwargs["load_in_8bit"] = True
     elif load_4bit:
+        kwargs["load_in_4bit"] = True
+        kwargs["quantization_config"] = BitsAndBytesConfig(
             load_in_4bit=True,
             bnb_4bit_compute_dtype=torch.float16,
             bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4",
         )
     # else:  # TODO: after fine-tuning LLava-Phi, load the model weights with fp16 will pose nan
     #     kwargs['torch_dtype'] = torch.float16
+    if "phi" in model_name.lower():
         # Load LLaVA-Phi model
+        if "lora" in model_name.lower() and model_base is None:
+            warnings.warn(
+                "There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument."
+            )
+        if "lora" in model_name.lower() and model_base is not None:
             lora_cfg_pretrained = AutoConfig.from_pretrained(model_path)
             tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
+            print("Loading LLaVA-Phi from base model...")
+            model = LlavaPhiForCausalLM.from_pretrained(
+                model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs
+            )
             token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features
             if model.lm_head.weight.shape[0] != token_num:
+                model.lm_head.weight = torch.nn.Parameter(
+                    torch.empty(
+                        token_num, tokem_dim, device=model.device, dtype=model.dtype
+                    )
+                )
+                model.model.embed_tokens.weight = torch.nn.Parameter(
+                    torch.empty(
+                        token_num, tokem_dim, device=model.device, dtype=model.dtype
+                    )
+                )
+            print("Loading additional LLaVA-Phi weights...")
+            if os.path.exists(os.path.join(model_path, "non_lora_trainables.bin")):
+                non_lora_trainables = torch.load(
+                    os.path.join(model_path, "non_lora_trainables.bin"),
+                    map_location="cpu",
+                )
             else:
                 # this is probably from HF Hub
                 from huggingface_hub import hf_hub_download
                 def load_from_hf(repo_id, filename, subfolder=None):
                     cache_file = hf_hub_download(
+                        repo_id=repo_id, filename=filename, subfolder=subfolder
+                    )
+                    return torch.load(cache_file, map_location="cpu")
+                non_lora_trainables = load_from_hf(
+                    model_path, "non_lora_trainables.bin"
+                )
+            non_lora_trainables = {
+                (k[11:] if k.startswith("base_model.") else k): v
+                for k, v in non_lora_trainables.items()
+            }
+            if any(k.startswith("model.model.") for k in non_lora_trainables):
+                non_lora_trainables = {
+                    (k[6:] if k.startswith("model.") else k): v
+                    for k, v in non_lora_trainables.items()
+                }
             model.load_state_dict(non_lora_trainables, strict=False)
             from peft import PeftModel
+            print("Loading LoRA weights...")
             model = PeftModel.from_pretrained(model, model_path)
+            print("Merging LoRA weights...")
             model = model.merge_and_unload()
+            print("Model is loaded...")
         elif model_base is not None:
             # this may be mm projector only
+            print("Loading LLaVA-Phi from base model...")
             tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
             cfg_pretrained = AutoConfig.from_pretrained(model_path)
+            model = LlavaPhiForCausalLM.from_pretrained(
+                model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs
+            )
+            mm_projector_weights = torch.load(
+                os.path.join(model_path, "mm_projector.bin"), map_location="cpu"
+            )
+            mm_projector_weights = {
+                k: v.to(torch.float16) for k, v in mm_projector_weights.items()
+            }
             model.load_state_dict(mm_projector_weights, strict=False)
         else:
             print("load llaVA-Phi MLLM!!!")
             config = LlavaPhiConfig.from_pretrained(model_path, trust_remote_code=True)
             tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
             model = LlavaPhiForCausalLM.from_pretrained(
+                model_path, config=config, use_safetensors=True, **kwargs
+            ).to("cuda")
     else:
         # Load language model
         if model_base is not None:
             # PEFT model
             from peft import PeftModel
             tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
+            model = AutoModelForCausalLM.from_pretrained(
+                model_base,
+                torch_dtype=torch.float16,
+                low_cpu_mem_usage=True,
+                device_map="auto",
+            )
             print(f"Loading LoRA weights from {model_path}")
             model = PeftModel.from_pretrained(model, model_path)
             print(f"Merging weights")
             model = model.merge_and_unload()
+            print("Convert to FP16...")
             model.to(torch.float16)
         else:
             tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+            model = AutoModelForCausalLM.from_pretrained(
+                model_path, low_cpu_mem_usage=True, **kwargs
+            )
     image_processor = CLIPImageProcessor.from_pretrained(model_path)
+    if "phi" in model_name.lower():
         mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
         mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
         if mm_use_im_patch_token:
             tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
         if mm_use_im_start_end:
+            tokenizer.add_tokens(
+                [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True
+            )
             # model.resize_token_embeddings(len(tokenizer))
     else:
         raise ValueError(f"Unsupported model name: {model_name}")

inference/model/language_model/configuration_llava_phi.py CHANGED Viewed

@@ -68,23 +68,23 @@ class LlavaPhiVisionConfig(PretrainedConfig):
     model_type = "llava_phi_clip_vision_model"
     def __init__(
-            self,
-            hidden_size=768,
-            intermediate_size=3072,
-            projection_dim=512,
-            num_hidden_layers=12,
-            num_attention_heads=12,
-            num_channels=3,
-            image_size=224,
-            patch_size=32,
-            hidden_act="quick_gelu",
-            layer_norm_eps=1e-5,
-            attention_dropout=0.0,
-            initializer_range=0.02,
-            initializer_factor=1.0,
-            mm_vision_select_feature="patch",
-            mm_vision_select_layer=-2,
-            **kwargs,
     ):
         super().__init__(**kwargs)
@@ -105,16 +105,24 @@ class LlavaPhiVisionConfig(PretrainedConfig):
         self.mm_vision_select_layer = mm_vision_select_layer
     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
         cls._set_token_in_kwargs(kwargs)
-        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
         # get the vision config dict if we are loading from CLIPConfig
         if config_dict.get("model_type") == "llava_phi-phi":
             config_dict = config_dict["vision_config"]
-        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
             logger.warning(
                 f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                 f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
@@ -127,11 +135,7 @@ class ProjectorConfig(PretrainedConfig):
     model_type = "llava_phi_projector"
     def __init__(
-            self,
-            mm_projector_type="linear",
-            mm_hidden_size=768,
-            hidden_size=2560,
-            **kwargs
     ):
         self.mm_projector_type = mm_projector_type
         self.mm_hidden_size = mm_hidden_size
@@ -139,16 +143,24 @@ class ProjectorConfig(PretrainedConfig):
         super().__init__(**kwargs)
     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
         cls._set_token_in_kwargs(kwargs)
-        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
         # get the vision config dict if we are loading from CLIPConfig
         if config_dict.get("model_type") == "llava_phi-phi":
             config_dict = config_dict["projector_config"]
-        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
             logger.warning(
                 f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                 f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
@@ -159,7 +171,7 @@ class ProjectorConfig(PretrainedConfig):
 DEFAULT_VISUAL_CONFIG = {
     "vision_tower": LlavaPhiVisionConfig().to_dict(),
-    "mm_projector": ProjectorConfig().to_dict()
 }

     model_type = "llava_phi_clip_vision_model"
     def __init__(
+        self,
+        hidden_size=768,
+        intermediate_size=3072,
+        projection_dim=512,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        num_channels=3,
+        image_size=224,
+        patch_size=32,
+        hidden_act="quick_gelu",
+        layer_norm_eps=1e-5,
+        attention_dropout=0.0,
+        initializer_range=0.02,
+        initializer_factor=1.0,
+        mm_vision_select_feature="patch",
+        mm_vision_select_layer=-2,
+        **kwargs,
     ):
         super().__init__(**kwargs)
         self.mm_vision_select_layer = mm_vision_select_layer
     @classmethod
+    def from_pretrained(
+        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
+    ) -> "PretrainedConfig":
         cls._set_token_in_kwargs(kwargs)
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs
+        )
         # get the vision config dict if we are loading from CLIPConfig
         if config_dict.get("model_type") == "llava_phi-phi":
             config_dict = config_dict["vision_config"]
+        if (
+            "model_type" in config_dict
+            and hasattr(cls, "model_type")
+            and config_dict["model_type"] != cls.model_type
+        ):
             logger.warning(
                 f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                 f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
     model_type = "llava_phi_projector"
     def __init__(
+        self, mm_projector_type="linear", mm_hidden_size=768, hidden_size=2560, **kwargs
     ):
         self.mm_projector_type = mm_projector_type
         self.mm_hidden_size = mm_hidden_size
         super().__init__(**kwargs)
     @classmethod
+    def from_pretrained(
+        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
+    ) -> "PretrainedConfig":
         cls._set_token_in_kwargs(kwargs)
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs
+        )
         # get the vision config dict if we are loading from CLIPConfig
         if config_dict.get("model_type") == "llava_phi-phi":
             config_dict = config_dict["projector_config"]
+        if (
+            "model_type" in config_dict
+            and hasattr(cls, "model_type")
+            and config_dict["model_type"] != cls.model_type
+        ):
             logger.warning(
                 f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                 f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
 DEFAULT_VISUAL_CONFIG = {
     "vision_tower": LlavaPhiVisionConfig().to_dict(),
+    "mm_projector": ProjectorConfig().to_dict(),
 }

inference/model/llava_arch.py CHANGED Viewed

@@ -19,8 +19,19 @@ import torch
 from .multimodal_encoder.clip_encoder import CLIPVisionTower
 from .multimodal_projector.builder import build_vision_projector
-from .language_model.configuration_llava_phi import LlavaPhiConfig, LlavaPhiVisionConfig, ProjectorConfig
-from llava_phi.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
 class LlavaMetaModel:
@@ -34,14 +45,13 @@ class LlavaMetaModel:
         )
     def get_vision_tower(self):
-        vision_tower = getattr(self, 'vision_tower', None)
         if type(vision_tower) is list:
             vision_tower = vision_tower[0]
         return vision_tower
 class LlavaMetaForCausalLM(ABC):
     @abstractmethod
     def get_model(self):
         pass
@@ -59,8 +69,17 @@ class LlavaMetaForCausalLM(ABC):
     ):
         vision_tower = self.get_vision_tower()
         if vision_tower is None or images is None or input_ids.shape[1] == 1:
-            if past_key_values is not None and vision_tower is not None and images is not None and input_ids.shape[1] == 1:
-                attention_mask = torch.ones((attention_mask.shape[0], past_key_values[-1][-1].shape[-2] + 1), dtype=attention_mask.dtype, device=attention_mask.device)
             return input_ids, attention_mask, past_key_values, None, labels
         if type(images) is list or images.ndim == 5:
@@ -81,9 +100,16 @@ class LlavaMetaForCausalLM(ABC):
                 # FIXME: this is a hacky fix, for deepspeed zero3 to work
                 half_len = cur_input_ids.shape[0] // 2
                 cur_image_features = image_features[cur_image_idx]
-                cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids[:half_len])
-                cur_input_embeds_2 = self.get_model().embed_tokens(cur_input_ids[half_len:])
-                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0], cur_input_embeds_2], dim=0)
                 new_input_embeds.append(cur_input_embeds)
                 if labels is not None:
                     new_labels.append(labels[batch_idx])
@@ -98,37 +124,79 @@ class LlavaMetaForCausalLM(ABC):
             while image_token_indices.numel() > 0:
                 cur_image_features = image_features[cur_image_idx]
                 image_token_start = image_token_indices[0]
-                if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
-                    cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[:image_token_start-1]).detach())
-                    cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[image_token_start-1:image_token_start]))
                     cur_new_input_embeds.append(cur_image_features)
-                    cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[image_token_start+1:image_token_start+2]))
                     if labels is not None:
                         cur_new_labels.append(cur_labels[:image_token_start])
-                        cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
-                        cur_new_labels.append(cur_labels[image_token_start:image_token_start+1])
-                        cur_labels = cur_labels[image_token_start+2:]
                 else:
-                    cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[:image_token_start]))
                     cur_new_input_embeds.append(cur_image_features)
                     if labels is not None:
                         cur_new_labels.append(cur_labels[:image_token_start])
-                        cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
-                        cur_labels = cur_labels[image_token_start+1:]
                 cur_image_idx += 1
-                if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
-                    cur_input_ids = cur_input_ids[image_token_start+2:]
                 else:
-                    cur_input_ids = cur_input_ids[image_token_start+1:]
                 image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0]
             if cur_input_ids.numel() > 0:
-                if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
-                    cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids).detach())
                 else:
-                    cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids))
                 if labels is not None:
                     cur_new_labels.append(cur_labels)
-            cur_new_input_embeds = [x.to(device=self.device) for x in cur_new_input_embeds]
             cur_new_input_embeds = torch.cat(cur_new_input_embeds, dim=0)
             new_input_embeds.append(cur_new_input_embeds)
             if labels is not None:
@@ -140,7 +208,17 @@ class LlavaMetaForCausalLM(ABC):
             new_input_embeds_align = []
             for cur_new_embed in new_input_embeds:
-                cur_new_embed = torch.cat((cur_new_embed, torch.zeros((max_len - cur_new_embed.shape[0], cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0)
                 new_input_embeds_align.append(cur_new_embed)
             new_input_embeds = torch.stack(new_input_embeds_align, dim=0)
@@ -148,27 +226,67 @@ class LlavaMetaForCausalLM(ABC):
                 new_labels_align = []
                 _new_labels = new_labels
                 for cur_new_label in new_labels:
-                    cur_new_label = torch.cat((cur_new_label, torch.full((max_len - cur_new_label.shape[0],), IGNORE_INDEX, dtype=cur_new_label.dtype, device=cur_new_label.device)), dim=0)
                     new_labels_align.append(cur_new_label)
                 new_labels = torch.stack(new_labels_align, dim=0)
             if attention_mask is not None:
                 new_attention_mask = []
-                for cur_attention_mask, cur_new_labels, cur_new_labels_align in zip(attention_mask, _new_labels, new_labels):
-                    new_attn_mask_pad_left = torch.full((cur_new_labels.shape[0] - labels.shape[1],), True, dtype=attention_mask.dtype, device=attention_mask.device)
-                    new_attn_mask_pad_right = torch.full((cur_new_labels_align.shape[0] - cur_new_labels.shape[0],), False, dtype=attention_mask.dtype, device=attention_mask.device)
-                    cur_new_attention_mask = torch.cat((new_attn_mask_pad_left, cur_attention_mask, new_attn_mask_pad_right), dim=0)
                     new_attention_mask.append(cur_new_attention_mask)
                 attention_mask = torch.stack(new_attention_mask, dim=0)
                 assert attention_mask.shape == new_labels.shape
         else:
             new_input_embeds = torch.stack(new_input_embeds, dim=0)
             if labels is not None:
-                new_labels  = torch.stack(new_labels, dim=0)
             if attention_mask is not None:
-                new_attn_mask_pad_left = torch.full((attention_mask.shape[0], new_input_embeds.shape[1] - input_ids.shape[1]), True, dtype=attention_mask.dtype, device=attention_mask.device)
-                attention_mask = torch.cat((new_attn_mask_pad_left, attention_mask), dim=1)
                 assert attention_mask.shape == new_input_embeds.shape[:2]
         return None, attention_mask, past_key_values, new_input_embeds, new_labels
@@ -179,7 +297,9 @@ class LlavaMetaForCausalLM(ABC):
             self.resize_token_embeddings(len(tokenizer))
         if model_args.mm_use_im_start_end:
-            num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
             self.resize_token_embeddings(len(tokenizer))
             if num_new_tokens > 0:
@@ -187,9 +307,11 @@ class LlavaMetaForCausalLM(ABC):
                 output_embeddings = self.get_output_embeddings().weight.data
                 input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
-                    dim=0, keepdim=True)
                 output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
-                    dim=0, keepdim=True)
                 input_embeddings[-num_new_tokens:] = input_embeddings_avg
                 output_embeddings[-num_new_tokens:] = output_embeddings_avg
@@ -199,7 +321,7 @@ class LlavaMetaForCausalLM(ABC):
                     p.requires_grad = True
                 for p in self.get_output_embeddings().parameters():
                     p.requires_grad = False
         elif model_args.mm_use_im_patch_token:
             if model_args.tune_mm_mlp_adapter:
                 for p in self.get_input_embeddings().parameters():

 from .multimodal_encoder.clip_encoder import CLIPVisionTower
 from .multimodal_projector.builder import build_vision_projector
+from .language_model.configuration_llava_phi import (
+    LlavaPhiConfig,
+    LlavaPhiVisionConfig,
+    ProjectorConfig,
+)
+# from llava_phi.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+IGNORE_INDEX = -100
+IMAGE_TOKEN_INDEX = -200
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_IM_END_TOKEN = "<im_end>"
 class LlavaMetaModel:
         )
     def get_vision_tower(self):
+        vision_tower = getattr(self, "vision_tower", None)
         if type(vision_tower) is list:
             vision_tower = vision_tower[0]
         return vision_tower
 class LlavaMetaForCausalLM(ABC):
     @abstractmethod
     def get_model(self):
         pass
     ):
         vision_tower = self.get_vision_tower()
         if vision_tower is None or images is None or input_ids.shape[1] == 1:
+            if (
+                past_key_values is not None
+                and vision_tower is not None
+                and images is not None
+                and input_ids.shape[1] == 1
+            ):
+                attention_mask = torch.ones(
+                    (attention_mask.shape[0], past_key_values[-1][-1].shape[-2] + 1),
+                    dtype=attention_mask.dtype,
+                    device=attention_mask.device,
+                )
             return input_ids, attention_mask, past_key_values, None, labels
         if type(images) is list or images.ndim == 5:
                 # FIXME: this is a hacky fix, for deepspeed zero3 to work
                 half_len = cur_input_ids.shape[0] // 2
                 cur_image_features = image_features[cur_image_idx]
+                cur_input_embeds_1 = self.get_model().embed_tokens(
+                    cur_input_ids[:half_len]
+                )
+                cur_input_embeds_2 = self.get_model().embed_tokens(
+                    cur_input_ids[half_len:]
+                )
+                cur_input_embeds = torch.cat(
+                    [cur_input_embeds_1, cur_image_features[0:0], cur_input_embeds_2],
+                    dim=0,
+                )
                 new_input_embeds.append(cur_input_embeds)
                 if labels is not None:
                     new_labels.append(labels[batch_idx])
             while image_token_indices.numel() > 0:
                 cur_image_features = image_features[cur_image_idx]
                 image_token_start = image_token_indices[0]
+                if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr(
+                    self.config, "mm_use_im_start_end", False
+                ):
+                    cur_new_input_embeds.append(
+                        self.get_model()
+                        .embed_tokens(cur_input_ids[: image_token_start - 1])
+                        .detach()
+                    )
+                    cur_new_input_embeds.append(
+                        self.get_model().embed_tokens(
+                            cur_input_ids[image_token_start - 1 : image_token_start]
+                        )
+                    )
                     cur_new_input_embeds.append(cur_image_features)
+                    cur_new_input_embeds.append(
+                        self.get_model().embed_tokens(
+                            cur_input_ids[image_token_start + 1 : image_token_start + 2]
+                        )
+                    )
                     if labels is not None:
                         cur_new_labels.append(cur_labels[:image_token_start])
+                        cur_new_labels.append(
+                            torch.full(
+                                (cur_image_features.shape[0],),
+                                IGNORE_INDEX,
+                                device=labels.device,
+                                dtype=labels.dtype,
+                            )
+                        )
+                        cur_new_labels.append(
+                            cur_labels[image_token_start : image_token_start + 1]
+                        )
+                        cur_labels = cur_labels[image_token_start + 2 :]
                 else:
+                    cur_new_input_embeds.append(
+                        self.get_model().embed_tokens(cur_input_ids[:image_token_start])
+                    )
                     cur_new_input_embeds.append(cur_image_features)
                     if labels is not None:
                         cur_new_labels.append(cur_labels[:image_token_start])
+                        cur_new_labels.append(
+                            torch.full(
+                                (cur_image_features.shape[0],),
+                                IGNORE_INDEX,
+                                device=labels.device,
+                                dtype=labels.dtype,
+                            )
+                        )
+                        cur_labels = cur_labels[image_token_start + 1 :]
                 cur_image_idx += 1
+                if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr(
+                    self.config, "mm_use_im_start_end", False
+                ):
+                    cur_input_ids = cur_input_ids[image_token_start + 2 :]
                 else:
+                    cur_input_ids = cur_input_ids[image_token_start + 1 :]
                 image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0]
             if cur_input_ids.numel() > 0:
+                if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr(
+                    self.config, "mm_use_im_start_end", False
+                ):
+                    cur_new_input_embeds.append(
+                        self.get_model().embed_tokens(cur_input_ids).detach()
+                    )
                 else:
+                    cur_new_input_embeds.append(
+                        self.get_model().embed_tokens(cur_input_ids)
+                    )
                 if labels is not None:
                     cur_new_labels.append(cur_labels)
+            cur_new_input_embeds = [
+                x.to(device=self.device) for x in cur_new_input_embeds
+            ]
             cur_new_input_embeds = torch.cat(cur_new_input_embeds, dim=0)
             new_input_embeds.append(cur_new_input_embeds)
             if labels is not None:
             new_input_embeds_align = []
             for cur_new_embed in new_input_embeds:
+                cur_new_embed = torch.cat(
+                    (
+                        cur_new_embed,
+                        torch.zeros(
+                            (max_len - cur_new_embed.shape[0], cur_new_embed.shape[1]),
+                            dtype=cur_new_embed.dtype,
+                            device=cur_new_embed.device,
+                        ),
+                    ),
+                    dim=0,
+                )
                 new_input_embeds_align.append(cur_new_embed)
             new_input_embeds = torch.stack(new_input_embeds_align, dim=0)
                 new_labels_align = []
                 _new_labels = new_labels
                 for cur_new_label in new_labels:
+                    cur_new_label = torch.cat(
+                        (
+                            cur_new_label,
+                            torch.full(
+                                (max_len - cur_new_label.shape[0],),
+                                IGNORE_INDEX,
+                                dtype=cur_new_label.dtype,
+                                device=cur_new_label.device,
+                            ),
+                        ),
+                        dim=0,
+                    )
                     new_labels_align.append(cur_new_label)
                 new_labels = torch.stack(new_labels_align, dim=0)
             if attention_mask is not None:
                 new_attention_mask = []
+                for cur_attention_mask, cur_new_labels, cur_new_labels_align in zip(
+                    attention_mask, _new_labels, new_labels
+                ):
+                    new_attn_mask_pad_left = torch.full(
+                        (cur_new_labels.shape[0] - labels.shape[1],),
+                        True,
+                        dtype=attention_mask.dtype,
+                        device=attention_mask.device,
+                    )
+                    new_attn_mask_pad_right = torch.full(
+                        (cur_new_labels_align.shape[0] - cur_new_labels.shape[0],),
+                        False,
+                        dtype=attention_mask.dtype,
+                        device=attention_mask.device,
+                    )
+                    cur_new_attention_mask = torch.cat(
+                        (
+                            new_attn_mask_pad_left,
+                            cur_attention_mask,
+                            new_attn_mask_pad_right,
+                        ),
+                        dim=0,
+                    )
                     new_attention_mask.append(cur_new_attention_mask)
                 attention_mask = torch.stack(new_attention_mask, dim=0)
                 assert attention_mask.shape == new_labels.shape
         else:
             new_input_embeds = torch.stack(new_input_embeds, dim=0)
             if labels is not None:
+                new_labels = torch.stack(new_labels, dim=0)
             if attention_mask is not None:
+                new_attn_mask_pad_left = torch.full(
+                    (
+                        attention_mask.shape[0],
+                        new_input_embeds.shape[1] - input_ids.shape[1],
+                    ),
+                    True,
+                    dtype=attention_mask.dtype,
+                    device=attention_mask.device,
+                )
+                attention_mask = torch.cat(
+                    (new_attn_mask_pad_left, attention_mask), dim=1
+                )
                 assert attention_mask.shape == new_input_embeds.shape[:2]
         return None, attention_mask, past_key_values, new_input_embeds, new_labels
             self.resize_token_embeddings(len(tokenizer))
         if model_args.mm_use_im_start_end:
+            num_new_tokens = tokenizer.add_tokens(
+                [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True
+            )
             self.resize_token_embeddings(len(tokenizer))
             if num_new_tokens > 0:
                 output_embeddings = self.get_output_embeddings().weight.data
                 input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
+                    dim=0, keepdim=True
+                )
                 output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
+                    dim=0, keepdim=True
+                )
                 input_embeddings[-num_new_tokens:] = input_embeddings_avg
                 output_embeddings[-num_new_tokens:] = output_embeddings_avg
                     p.requires_grad = True
                 for p in self.get_output_embeddings().parameters():
                     p.requires_grad = False
         elif model_args.mm_use_im_patch_token:
             if model_args.tune_mm_mlp_adapter:
                 for p in self.get_input_embeddings().parameters():

inference/model/multimodal_encoder/clip_encoder.py CHANGED Viewed

@@ -5,7 +5,7 @@ import torch.nn as nn
 from transformers import CLIPPreTrainedModel, CLIPVisionConfig
 from transformers.models.clip.modeling_clip import CLIPVisionTransformer
-from llava_phi.model.language_model.configuration_llava_phi import LlavaPhiVisionConfig
 class CLIPVisionTower(CLIPPreTrainedModel):

 from transformers import CLIPPreTrainedModel, CLIPVisionConfig
 from transformers.models.clip.modeling_clip import CLIPVisionTransformer
+from inference.model.language_model.configuration_llava_phi import LlavaPhiVisionConfig
 class CLIPVisionTower(CLIPPreTrainedModel):

requirements.txt CHANGED Viewed

@@ -6,7 +6,6 @@ gradio_client==0.2.9
 markdown2[all]
 numpy
 requests
-sentencepiece
 tokenizers==0.15.0
 torch==2.0.1
 shortuuid

 markdown2[all]
 numpy
 requests
 tokenizers==0.15.0
 torch==2.0.1
 shortuuid