Space status: Runtime error

miguelcarv committed 34f251f ("first commit"); 1 parent: fb2630f

Files changed:
- app.py +95 -55
- pheye_builder/__init__.py +1 -0
- pheye_builder/__pycache__/__init__.cpython-311.pyc +0 -0
- pheye_builder/__pycache__/encoder.cpython-311.pyc +0 -0
- pheye_builder/__pycache__/factory.cpython-311.pyc +0 -0
- pheye_builder/__pycache__/phEYE.cpython-311.pyc +0 -0
- pheye_builder/__pycache__/utils.cpython-311.pyc +0 -0
- pheye_builder/__pycache__/wrapper_lm.cpython-311.pyc +0 -0
- pheye_builder/__pycache__/xattn.cpython-311.pyc +0 -0
- pheye_builder/encoder.py +179 -0
- pheye_builder/factory.py +126 -0
- pheye_builder/phEYE.py +220 -0
- pheye_builder/utils.py +48 -0
- pheye_builder/wrapper_lm.py +132 -0
- pheye_builder/xattn.py +159 -0
- requirements.txt +8 -1
app.py
CHANGED
@@ -1,63 +1,103 @@
Most of the body of the previous 63-line app.py (old lines 4-63) is removed; the removed content is collapsed in this view. The new 103-line app.py is:

import gradio as gr
from huggingface_hub import InferenceClient
import json
from pheye_builder import create_model_and_transforms
from huggingface_hub import hf_hub_download
import torch
from PIL import Image
import os
import requests


def get_config(hf_model_path):
    config_path = hf_hub_download(hf_model_path, "config.json")

    with open(config_path, "r") as f:
        config = json.load(f)

    return config


def get_model_path(hf_model_path):
    return hf_hub_download(hf_model_path, "checkpoint.pt")


HF_MODEL = "miguelcarv/Pheye-x2-672"
config = get_config(HF_MODEL)

print("Got config")

model, tokenizer = create_model_and_transforms(
    clip_vision_encoder_path=config["encoder"],
    lang_decoder_path=config["decoder"],
    tokenizer_path=config["tokenizer"],
    cross_attn_every_n_layers=config["cross_interval"],
    level=config["level"],
    reduce_factor=config["reduce"],
    from_layer=config["from_layer"],
    encoder_dtype=eval(config["encoder_dtype"]),
    decoder_dtype=eval(config["decoder_dtype"]),
    dtype=eval(config["other_params_dtype"])
)

if config["first_level"]:
    model.vision_encoder.add_first_level_adapter()

print("Created model")

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_path = get_model_path(HF_MODEL)
model.load_state_dict(torch.load(model_path, map_location="cpu"))
model = model.to(DEVICE)

print("Loaded model")

SYSTEM_PROMPT = "You are an AI visual assistant and you are seeing a single image. You will receive an instruction regarding that image. Your goal is to follow the instruction as faithfully as you can."

whiteboard = Image.open(requests.get("https://c1.staticflickr.com/7/6168/6207108414_a8833f410e_o.jpg", stream=True).raw).convert('RGB')
taxi_image = Image.open(requests.get("https://llava.hliu.cc/file=/nobackup/haotian/tmp/gradio/ca10383cc943e99941ecffdc4d34c51afb2da472/extreme_ironing.jpg", stream=True).raw).convert('RGB')


def generate_answer(img, question, max_new_tokens, num_beams):

    image = [img]
    prompt = [f"{SYSTEM_PROMPT}\n\nInstruction: {question}\nOutput:"]
    inputs = tokenizer(prompt, padding='longest', return_tensors='pt')
    print("Generating a response with the following parameters:")
    print(f"""Question: {question}\nMax New Tokens: {max_new_tokens}\nNum Beams: {num_beams}""")

    model.eval()
    with torch.no_grad():
        outputs = model.generate(vision_x=image,
                                 lang_x=inputs.input_ids.to(DEVICE),
                                 device=DEVICE,
                                 max_new_tokens=max_new_tokens,
                                 num_beams=num_beams,
                                 eos_token_id=tokenizer.eos_token_id,
                                 pad_token_id=tokenizer.pad_token_id,
                                 attention_mask=inputs.attention_mask.to(DEVICE))
    answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0].split("Output:")[-1].lstrip()

    return answer


# Create the Gradio interface
iface = gr.Interface(
    fn=generate_answer,
    inputs=[
        gr.Image(type="pil", label="Image"),
        gr.Textbox(label="Question"),
        gr.Slider(minimum=5, maximum=500, step=1, value=50, label="Max New Tokens"),
        gr.Slider(minimum=1, maximum=5, step=1, value=3, label="Num Beams")
    ],
    outputs=gr.Textbox(label="Answer"),
    title="<h1 style='text-align: center; display: block;'>Pheye-x2 672x672 pixels</h1>",
    examples=[[taxi_image, "What is unusual about this image?"], [whiteboard, "What is the main topic of the whiteboard?"]]
)


if __name__ == "__main__":
    # Launch the Gradio app
    iface.launch()
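A minimal sketch (not part of this commit) of how app.py assembles the prompt and recovers the answer from the decoded generation; the decoded string below is a made-up example used only to illustrate the "Output:" split:

SYSTEM_PROMPT = ("You are an AI visual assistant and you are seeing a single image. "
                 "You will receive an instruction regarding that image. Your goal is "
                 "to follow the instruction as faithfully as you can.")

question = "What is unusual about this image?"
prompt = f"{SYSTEM_PROMPT}\n\nInstruction: {question}\nOutput:"

# The model echoes the prompt, so the answer is whatever follows "Output:".
decoded = prompt + " A man is ironing clothes on the back of a moving taxi."
answer = decoded.split("Output:")[-1].lstrip()
print(answer)  # -> "A man is ironing clothes on the back of a moving taxi."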
pheye_builder/__init__.py
ADDED
@@ -0,0 +1 @@
from .factory import create_model_and_transforms
pheye_builder/__pycache__/__init__.cpython-311.pyc
ADDED (binary file, 250 Bytes)
pheye_builder/__pycache__/encoder.cpython-311.pyc
ADDED (binary file, 12.6 kB)
pheye_builder/__pycache__/factory.cpython-311.pyc
ADDED (binary file, 5.9 kB)
pheye_builder/__pycache__/phEYE.cpython-311.pyc
ADDED (binary file, 10.2 kB)
pheye_builder/__pycache__/utils.cpython-311.pyc
ADDED (binary file, 2.28 kB)
pheye_builder/__pycache__/wrapper_lm.cpython-311.pyc
ADDED (binary file, 7.12 kB)
pheye_builder/__pycache__/xattn.cpython-311.pyc
ADDED (binary file, 7.7 kB)
pheye_builder/encoder.py
ADDED
@@ -0,0 +1,179 @@
from transformers import CLIPModel
from torch import nn
from peft import LoraConfig, get_peft_model
import torch
from torch import nn
import PIL
from PIL.Image import BICUBIC
import math
from torchvision import transforms
import torch.nn.functional as F


# level 4 which has 21 patches was being used in previous experiments so now I can't remove it or won't be able to load older models....
LEVELS_TO_PATCHES = {
    1: 1,
    2: 5,
    3: 10,
    4: 21
}

def cut_image_patches(image: PIL.Image, encoder_resolution: int = 224):

    coordinates = []

    width, height = image.size

    width_tiles = [i * encoder_resolution for i in range(math.ceil(width / encoder_resolution) - 1)]
    width_tiles.append(width - encoder_resolution)
    height_tiles = [i * encoder_resolution for i in range(math.ceil(height / encoder_resolution) - 1)]
    height_tiles.append(height - encoder_resolution)

    for w in width_tiles:
        for h in height_tiles:
            coordinates.append((w, h, w + encoder_resolution, h + encoder_resolution))

    cropped_images = [image.crop(c) for c in coordinates]

    return cropped_images


class Encoder(nn.Module):

    def __init__(self, clip_name, level = 2, dtype = None, use_dropout = True) -> None:
        super().__init__()

        if level not in LEVELS_TO_PATCHES:
            raise ValueError("Resolution not supported")

        self.n_patches = LEVELS_TO_PATCHES[level]
        self.vision_model = CLIPModel.from_pretrained(clip_name, torch_dtype=dtype).vision_model
        self.has_first_adapter = False
        self.image_size = self.vision_model.config.image_size
        self.patch_size = self.vision_model.config.patch_size
        self.use_dropout = use_dropout
        self.dtype = dtype

        mean = (0.48145466, 0.4578275, 0.40821073)
        std = (0.26862954, 0.26130258, 0.27577711)
        self.image_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=mean, std=std),
        ])

        self.norm_lvl_1 = nn.LayerNorm(self.vision_model.config.hidden_size, dtype=dtype)
        self.norm_lvl_2 = nn.LayerNorm(self.vision_model.config.hidden_size, dtype=dtype)

        # this was being used in previous experiments so now I can't remove it or won't be able to load older models....
        self.norm_lvl_3 = nn.LayerNorm(self.vision_model.config.hidden_size, dtype=dtype)

        if level == 1:
            self.connector = nn.LayerNorm(self.vision_model.config.hidden_size, dtype=dtype)
        else:
            self.connector = Position(self.n_patches, self.vision_model.config.hidden_size, dtype=dtype)

        config_level2 = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q_proj", "k_proj", "v_proj", "out_proj", "patch_embedding", "fc1", "fc2"],
            lora_dropout=0.05 if self.use_dropout else 0,
            bias="none"
        )
        self.vision_model = get_peft_model(self.vision_model, config_level2, "second")

    def add_first_level_adapter(self):

        config_224 = LoraConfig(
            r=8,
            lora_alpha=16,
            target_modules=["q_proj", "k_proj", "v_proj", "out_proj", "patch_embedding", "fc1", "fc2"],
            lora_dropout=0.05 if self.use_dropout else 0,
            bias="none"
        )

        self.vision_model.add_adapter("first", config_224)
        self.has_first_adapter = True

    def forward(self, images: list, device = "cpu", **kwargs):
        """
        shape (B, C, H, W) in list form
        """
        B = len(images)
        h = int((self.image_size / self.patch_size) ** 2 + 1)
        resized_images = {1: [], 2: []}

        for i in images:
            resized_images[1].append(self.image_transform(i.resize((self.image_size, self.image_size), resample=BICUBIC)))

            if self.n_patches == 5:
                for crop in cut_image_patches(i.resize((self.image_size * 2, self.image_size * 2), resample=BICUBIC), encoder_resolution=self.image_size):
                    resized_images[2].append(self.image_transform(crop))
            elif self.n_patches == 10:
                for crop in cut_image_patches(i.resize((self.image_size * 3, self.image_size * 3), resample=BICUBIC), encoder_resolution=self.image_size):
                    resized_images[2].append(self.image_transform(crop))

        vision_features = []
        for res, imgs in resized_images.items():
            if imgs != []:
                resized_images[res] = torch.stack(imgs, dim = 0).to(device)

                if res == 1 and self.has_first_adapter:
                    self.vision_model.set_adapter("first")
                    vision_features.append(self.norm_lvl_1(self.vision_model(resized_images[res]).last_hidden_state))
                elif res == 1:
                    with self.vision_model.disable_adapter():
                        vision_features.append(self.norm_lvl_1(self.vision_model(resized_images[res]).last_hidden_state))
                elif res == 2:
                    self.vision_model.set_adapter("second")
                    if self.n_patches == 5:
                        vision_features.append(self.norm_lvl_2(self.vision_model(resized_images[res]).last_hidden_state.view(B, h * 4, -1)))
                    elif self.n_patches == 10:
                        vision_features.append(self.norm_lvl_2(self.vision_model(resized_images[res]).last_hidden_state.view(B, h * 9, -1)))

        vision_features = torch.cat(vision_features, dim = 1)
        vision_features = self.connector(vision_features)

        return vision_features


class Position(nn.Module):

    def __init__(self, n_patches, dim, dtype) -> None:
        super().__init__()

        self.embedding = nn.Embedding(max(LEVELS_TO_PATCHES.values()), dim, dtype=dtype)
        self.n_patches = n_patches

        self.apply(self._init_weights)

    def forward(self, vision_features):

        batch_size, seq_len, dim = vision_features.size()
        single_encoder_dim = seq_len // self.n_patches
        device = vision_features.get_device()

        pos = torch.LongTensor(list(range(self.n_patches))).to(device if device != -1 else "cpu")
        pos = torch.repeat_interleave(self.embedding(pos).unsqueeze(0), single_encoder_dim, 1).expand(batch_size, -1, -1)

        return vision_features + pos

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        for name, p in module.named_parameters():
            if name == "fc1.weight" or name == "fc2.weight" or name == "to_out.weight":
                p.data.normal_(mean=0.0, std=(0.02 / math.sqrt(2 * self.n_decoder_layers)))
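A minimal sketch (not part of this commit) of the tiling math behind cut_image_patches; the helper below mirrors the box computation to show why level 2 yields 1 + 4 = 5 patches and level 3 yields 1 + 9 = 10, matching LEVELS_TO_PATCHES:

import math

def crop_boxes(width, height, encoder_resolution=224):
    # Same logic as cut_image_patches: tile origins every encoder_resolution
    # pixels, with the last tile snapped to the image border.
    width_tiles = [i * encoder_resolution for i in range(math.ceil(width / encoder_resolution) - 1)]
    width_tiles.append(width - encoder_resolution)
    height_tiles = [i * encoder_resolution for i in range(math.ceil(height / encoder_resolution) - 1)]
    height_tiles.append(height - encoder_resolution)
    return [(w, h, w + encoder_resolution, h + encoder_resolution)
            for w in width_tiles for h in height_tiles]

print(crop_boxes(448, 448))       # 4 boxes: the level-2 crops, plus the global 224x224 view = 5 patches
print(len(crop_boxes(672, 672)))  # 9 boxes: the level-3 crops, plus the global view = 10 patches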
pheye_builder/factory.py
ADDED
@@ -0,0 +1,126 @@
from typing import Optional

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import torch

from .phEYE import phEYE
from .wrapper_lm import phEYELMMixin
from .utils import extend_instance
from .encoder import Encoder


def create_model_and_transforms(
    clip_vision_encoder_path: str,
    lang_decoder_path: str,
    tokenizer_path: str,
    dtype,
    cross_attn_every_n_layers: int = 1,
    use_local_files: bool = False,
    decoder_layers_attr_name: str = None,
    freeze_lm_embeddings: bool = True,
    cache_dir: Optional[str] = None,
    level: int = 2,
    encoder_dtype: torch.dtype = None,
    decoder_dtype: torch.dtype = None,
    use_dropout: bool = False,
    **pheye_kwargs,
):
    """
    Initialize a phEYE model from a pretrained vision encoder and language encoder.
    Appends special tokens to the tokenizer and freezes backbones.

    Args:
        clip_vision_encoder_path (str): path to pretrained CLIP model (e.g. "openai/clip-vit-base-patch32")
        lang_decoder_path (str): path to pretrained language decoder
        tokenizer_path (str): path to pretrained tokenizer
        cross_attn_every_n_layers (int, optional): determines how often to add a cross-attention layer. Defaults to 1.
        use_local_files (bool, optional): whether to use local files. Defaults to False.
        decoder_layers_attr_name (str, optional): name of the decoder layers attribute. Defaults to None.
        freeze_lm_embeddings (bool, optional): whether to freeze LM input embeddings.
        cache_dir (str, optional): path to cache directory for downloading HF weights.
    Returns:
        phEYE: phEYE model from pretrained vision and language encoders
        Tokenizer: A tokenizer for the language model
    """

    vision_encoder = Encoder(clip_vision_encoder_path, level=level, dtype=encoder_dtype, use_dropout=use_dropout)

    text_tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_path,
        local_files_only=use_local_files,
        trust_remote_code=True,
        cache_dir=cache_dir,
    )

    if text_tokenizer.pad_token is None:
        text_tokenizer.pad_token = text_tokenizer.eos_token

    lang_config = AutoConfig.from_pretrained(lang_decoder_path)
    lang_encoder = AutoModelForCausalLM.from_config(
        lang_config,
        torch_dtype=decoder_dtype
    )

    lang_encoder.config.decoder_start_token_id = None
    lang_encoder.config.pad_token_id = text_tokenizer.pad_token_id

    # convert LM to phEYELM
    extend_instance(lang_encoder, phEYELMMixin)

    if decoder_layers_attr_name is None:
        decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder)
    lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name)

    model = phEYE(
        vision_encoder,
        lang_encoder,
        vis_dim=vision_encoder.vision_model.config.hidden_size,
        cross_attn_every_n_layers=cross_attn_every_n_layers,
        dtype=dtype,
        **pheye_kwargs,
    )

    # Freeze all parameters
    model.lang_encoder.requires_grad_(False)
    assert sum(p.numel() for p in model.lang_encoder.parameters() if p.requires_grad) == 0

    # Unfreeze cross_attn_layers and, optionally, LM input embeddings
    model.lang_encoder.cross_attn_layers.requires_grad_(True)
    if not freeze_lm_embeddings:
        model.lang_encoder.get_input_embeddings().requires_grad_(True)

    print(
        f"phEYE model initialized with {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable parameters"
    )

    return model, text_tokenizer


def _infer_decoder_layers_attr_name(model):
    for k in __KNOWN_DECODER_LAYERS_ATTR_NAMES:
        if k.lower() in model.__class__.__name__.lower():
            return __KNOWN_DECODER_LAYERS_ATTR_NAMES[k]

    raise ValueError(
        "We require the attribute name for the nn.ModuleList in the decoder storing the transformer block layers. Please supply this string manually."
    )


__KNOWN_DECODER_LAYERS_ATTR_NAMES = {
    "opt": "model.decoder.layers",
    "gpt": "transformer.h",
    "gpt-j": "transformer.h",
    "pythia": "gpt_neox.layers",
    "llama": "model.layers",
    "gptneoxforcausallm": "gpt_neox.layers",
    "mpt": "transformer.blocks",
    "mosaicgpt": "transformer.blocks",
    "phi": "model.layers"
}
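A minimal sketch (not part of this commit) of how _infer_decoder_layers_attr_name resolves the decoder-layers attribute from the model's class name; the class names below are examples of Hugging Face naming used only to illustrate the substring match:

KNOWN = {
    "opt": "model.decoder.layers",
    "llama": "model.layers",
    "phi": "model.layers",
}

def infer(class_name):
    # Case-insensitive substring match against the known table.
    for key, attr in KNOWN.items():
        if key in class_name.lower():
            return attr
    raise ValueError("Unknown decoder; pass decoder_layers_attr_name explicitly.")

print(infer("PhiForCausalLM"))  # -> "model.layers"
print(infer("OPTForCausalLM"))  # -> "model.decoder.layers"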
pheye_builder/phEYE.py
ADDED
@@ -0,0 +1,220 @@
import torch
from peft import LoraConfig, get_peft_model
from torch import nn
import os


class phEYE(nn.Module):
    def __init__(
        self,
        vision_encoder: nn.Module,
        lang_encoder: nn.Module,
        vis_dim: int,
        dtype: torch.dtype,
        cross_attn_every_n_layers: int = 1,
        gradient_checkpointing: bool = False,
        reduce_factor = 1,
        from_layer = 0
    ):
        """
        Args:
            vision_encoder (nn.Module): module with CLIP vision model
            lang_encoder (nn.Module): HF causal language model
            vis_dim (int): Dimension of the visual features.
                Visual features are projected to match this shape along the last dimension.
            cross_attn_every_n_layers (int, optional): How often to apply cross attention after a transformer layer. Defaults to 1.
        """
        super().__init__()
        self.vis_dim = vis_dim
        if hasattr(lang_encoder.config, "d_model"):
            self.lang_dim = lang_encoder.config.d_model  # mpt uses d_model
        else:
            self.lang_dim = lang_encoder.config.hidden_size

        self.vision_encoder = vision_encoder
        self.lang_encoder = lang_encoder
        self.lang_encoder.init_pheye(
            lang_hidden_size=self.lang_dim,
            vis_hidden_size=self.vis_dim,
            cross_attn_every_n_layers=cross_attn_every_n_layers,
            gradient_checkpointing=gradient_checkpointing,
            reduce_factor=reduce_factor,
            from_layer=from_layer,
            dtype=dtype
        )
        self._use_gradient_checkpointing = gradient_checkpointing

    def forward(
        self,
        vision_x: list,
        lang_x: torch.Tensor,
        attention_mask: torch.Tensor = None,
        labels: torch.Tensor = None,
        clear_conditioned_layers: bool = True,
        past_key_values = None,
        use_cache: bool = False,
        device="cpu",
        is_textcaps = False
    ):
        """
        Forward pass of phEYE.

        Args:
            vision_x (list): Vision input
                shape (B, C, H, W)
            lang_x (torch.Tensor): Language input ids
                shape (B, txt_seq)
            attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
            labels (torch.Tensor, optional): Labels. Defaults to None.
            clear_conditioned_layers: if True, clear the conditioned layers
                once the forward pass is completed. Set this to False if the
                same set of images will be reused in a subsequent forward pass.
            past_key_values: pre-computed values to pass to the language model.
                See past_key_values documentation in Hugging Face CausalLM models.
            use_cache: whether to use cached key values. See use_cache
                documentation in Hugging Face CausalLM models.
        """
        assert (
            self.lang_encoder.initialized_pheye
        ), "Wrapper layers are not initialized. Please call `init_pheye` first."

        assert (
            self.lang_encoder._use_cached_vision_x or vision_x is not None
        ), "Must provide either vision_x or have precached media using cache_media()."

        if self.lang_encoder._use_cached_vision_x:
            # Case: use cached; vision_x should be cached and other
            # vision-related inputs should not be provided.
            assert (
                vision_x is None
            ), "Expect vision_x to be None when media has been cached using cache_media(). Try uncache_media() first."
            assert self.lang_encoder.is_conditioned()

        else:
            # Case: do not use caching (i.e. this is a standard forward pass)
            self._encode_vision_x(vision_x=vision_x, device=device, is_textcaps=is_textcaps)

        output = self.lang_encoder(
            input_ids=lang_x,
            attention_mask=attention_mask,
            labels=labels,
            past_key_values=past_key_values,
            use_cache=use_cache,
        )

        if clear_conditioned_layers:
            self.lang_encoder.clear_conditioned_layers()

        return output

    def generate(
        self,
        vision_x: list,
        lang_x: torch.Tensor,
        attention_mask: torch.Tensor = None,
        device = "cpu",
        **kwargs,
    ):
        """
        Generate text conditioned on vision and language inputs.

        Args:
            vision_x (list): Vision input
                shape (B, C, H, W)
            lang_x (torch.Tensor): Language input
                shape (B, T_txt)
            **kwargs: see generate documentation in Hugging Face CausalLM models. Some notable kwargs:
                max_length (int, optional): Maximum length of the output. Defaults to None.
                attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
                num_beams (int, optional): Number of beams. Defaults to 1.
                max_new_tokens (int, optional): Maximum new tokens. Defaults to None.
                temperature (float, optional): Temperature. Defaults to 1.0.
                top_k (int, optional): Top k. Defaults to 50.
                top_p (float, optional): Top p. Defaults to 1.0.
                no_repeat_ngram_size (int, optional): No repeat ngram size. Defaults to 0.
                length_penalty (float, optional): Length penalty. Defaults to 1.0.
                num_return_sequences (int, optional): Number of return sequences. Defaults to 1.
                do_sample (bool, optional): Do sample. Defaults to False.
                early_stopping (bool, optional): Early stopping. Defaults to False.
        Returns:
            torch.Tensor: lang_x with generated tokens appended to it
        """
        num_beams = kwargs.pop("num_beams", 1)

        self.lang_encoder._use_cached_vision_x = True
        self._encode_vision_x(vision_x=vision_x, device=device, repeat=num_beams)

        output = self.lang_encoder.generate(
            input_ids=lang_x,
            attention_mask=attention_mask,
            num_beams=num_beams,
            **kwargs,
        )

        self.lang_encoder.clear_conditioned_layers()
        self.lang_encoder._use_cached_vision_x = False
        return output

    def _encode_vision_x(self, vision_x: list, device="cpu", repeat = 1, is_textcaps = False):
        """
        Compute vision features by passing images through the vision encoder and conditioning the language model.
        Args:
            vision_x (list): Vision input
                shape (B, C, H, W)
        """
        if is_textcaps:
            vision_x = vision_x[::5]
            repeat = 5

        vision_x = self.vision_encoder(vision_x, device=device)

        if repeat > 1:
            vision_x = vision_x.repeat_interleave(repeat, dim=0)

        for layer in self.lang_encoder._get_decoder_layers():
            layer.condition_vis_x(vision_x)

    def cache_media(self, vision_x: list, device="cpu"):
        """
        Cache vision_x features from a list of images for log-likelihood evaluation.
        This is not meant to be used to cache things for generate().
        Args:
            vision_x (list): Vision input
                shape (B, C, H, W)
        """
        self._encode_vision_x(vision_x=vision_x, device=device)
        self.lang_encoder._use_cached_vision_x = True

    def uncache_media(self):
        """
        Clear all conditioning.
        """
        self.lang_encoder.clear_conditioned_layers()
        self.lang_encoder._use_cached_vision_x = False

    def save_model(self, _path):
        os.mkdir(_path)
        torch.save(self.vision_encoder.state_dict(), _path + "vision_encoder.pt")
        torch.save(self.lang_encoder.state_dict(), _path + "lang_encoder.pt")

    def add_lora_decoder(self):

        config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2"],
            lora_dropout=0.05,
            bias="none"
        )

        self.lang_encoder.old_decoder_blocks = get_peft_model(self.lang_encoder.old_decoder_blocks, config)

    def merge_and_unload(self):
        self.lang_encoder.old_decoder_blocks = self.lang_encoder.old_decoder_blocks.merge_and_unload()
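A minimal sketch (not part of this commit) of why _encode_vision_x repeats the vision features with repeat_interleave during beam search: Hugging Face generate() expands the text batch to batch_size * num_beams, so the features conditioned into each WrapperLayer must be expanded the same way to keep rows aligned. Requires torch; the shapes are made up for illustration:

import torch

batch_size, num_beams, img_seq, vis_dim = 2, 3, 5, 4
vision_x = torch.randn(batch_size, img_seq, vis_dim)

# Each image's features are duplicated once per beam, in batch order.
expanded = vision_x.repeat_interleave(num_beams, dim=0)
print(expanded.shape)                         # torch.Size([6, 5, 4])
print(torch.equal(expanded[0], expanded[1]))  # True: beams of the same image share features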
pheye_builder/utils.py
ADDED
@@ -0,0 +1,48 @@
def extend_instance(obj, mixin):
    """Apply mixins to a class instance after creation"""
    base_cls = obj.__class__
    base_cls_name = obj.__class__.__name__
    obj.__class__ = type(
        base_cls_name, (mixin, base_cls), {}
    )  # mixin needs to go first for our forward() logic to work


def getattr_recursive(obj, att):
    """
    Return nested attribute of obj
    Example: getattr_recursive(obj, 'a.b.c') is equivalent to obj.a.b.c
    """
    if att == "":
        return obj
    i = att.find(".")
    if i < 0:
        return getattr(obj, att)
    else:
        return getattr_recursive(getattr(obj, att[:i]), att[i + 1 :])


def setattr_recursive(obj, att, val):
    """
    Set nested attribute of obj
    Example: setattr_recursive(obj, 'a.b.c', val) is equivalent to obj.a.b.c = val
    """
    if "." in att:
        obj = getattr_recursive(obj, ".".join(att.split(".")[:-1]))
    setattr(obj, att.split(".")[-1], val)


def apply_with_stopping_condition(
    module, apply_fn, apply_condition=None, stopping_condition=None, **other_args
):
    if stopping_condition(module):
        return
    if apply_condition(module):
        apply_fn(module, **other_args)
    for child in module.children():
        apply_with_stopping_condition(
            child,
            apply_fn,
            apply_condition=apply_condition,
            stopping_condition=stopping_condition,
            **other_args
        )
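A minimal sketch (not part of this commit) of what extend_instance and getattr_recursive do, shown on plain Python objects; the classes here are made up for illustration:

class Base:
    def greet(self):
        return "base"

class Mixin:
    def greet(self):
        # super() walks past the mixin to the original class, which is how
        # phEYELMMixin.forward can delegate to the wrapped language model.
        return "mixin first, then " + super().greet()

def extend_instance(obj, mixin):
    base_cls = obj.__class__
    obj.__class__ = type(base_cls.__name__, (mixin, base_cls), {})

obj = Base()
extend_instance(obj, Mixin)
print(obj.greet())  # -> "mixin first, then base"

class Node:
    pass

root = Node()
root.child = Node()
root.child.value = 42

def getattr_recursive(obj, att):
    # Dotted attribute lookup, e.g. "child.value" -> obj.child.value
    head, _, tail = att.partition(".")
    return getattr_recursive(getattr(obj, head), tail) if tail else getattr(obj, head)

print(getattr_recursive(root, "child.value"))  # -> 42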
pheye_builder/wrapper_lm.py
ADDED
@@ -0,0 +1,132 @@
import torch.nn as nn
from .xattn import CrossAttentionBlock
from .utils import getattr_recursive, setattr_recursive


class WrapperLayer(nn.Module):
    """
    WrapperLayer is a wrapper around a CrossAttentionBlock and a DecoderLayer.
    """

    def __init__(
        self, cross_attn_layer, decoder_layer, gradient_checkpointing=False
    ):
        super().__init__()
        self.cross_attn_layer = cross_attn_layer
        self.decoder_layer = decoder_layer
        self.vis_x = None
        if self.cross_attn_layer is not None:
            self.cross_attn_layer._use_gradient_checkpointing = (
                gradient_checkpointing
            )
        self.decoder_layer._use_gradient_checkpointing = gradient_checkpointing

    def is_conditioned(self) -> bool:
        """Check whether the layer is conditioned."""
        return self.vis_x is not None

    # Used this great idea from this implementation of Flamingo (https://github.com/dhansmair/flamingo-mini/)
    def condition_vis_x(self, vis_x):
        self.vis_x = vis_x

    def forward(
        self,
        lang_x,
        attention_mask=None,
        **decoder_layer_kwargs,
    ):
        # Cross attention
        if self.cross_attn_layer is not None:
            if self.vis_x is None:
                raise ValueError("vis_x must be conditioned before forward pass")

            lang_x = self.cross_attn_layer(
                lang_x,
                self.vis_x
            )

        # Normal decoder layer
        lang_x = self.decoder_layer(
            lang_x, attention_mask=attention_mask, **decoder_layer_kwargs
        )

        return lang_x


class phEYELMMixin(nn.Module):
    """
    Mixin to add cross-attention layers to a language model.
    """

    def set_decoder_layers_attr_name(self, decoder_layers_attr_name):
        self.decoder_layers_attr_name = decoder_layers_attr_name

    def _get_decoder_layers(self):
        return getattr_recursive(self, self.decoder_layers_attr_name)

    def _set_decoder_layers(self, value):
        setattr_recursive(self, self.decoder_layers_attr_name, value)

    def init_pheye(
        self,
        lang_hidden_size,
        vis_hidden_size,
        dtype,
        cross_attn_every_n_layers,
        gradient_checkpointing,
        reduce_factor=1,
        from_layer=0
    ):
        """
        Initialize phEYE by adding new cross-attention layers to the decoder.
        """
        self.old_decoder_blocks = self._get_decoder_layers()
        self.cross_attn_layers = nn.ModuleList(
            [
                CrossAttentionBlock(
                    dim_text=lang_hidden_size, dim_visual=vis_hidden_size, reduce_factor=reduce_factor, layer_idx=layer_idx, n_decoder_layers=len(self.old_decoder_blocks), dtype=dtype
                )
                if (layer_idx + 1) % cross_attn_every_n_layers == 0 and layer_idx >= from_layer
                else None
                for layer_idx, _ in enumerate(self._get_decoder_layers())
            ]
        )
        self.init_pheye_layers(gradient_checkpointing)
        self.initialized_pheye = True
        self._use_cached_vision_x = False

    def init_pheye_layers(self, gradient_checkpointing):
        """
        Re-initializes the WrapperLayers.
        Propagates any changes made to self.cross_attn_layers or self.old_decoder_blocks.
        """
        self._set_decoder_layers(
            nn.ModuleList(
                [
                    WrapperLayer(
                        cross_attn_layer, decoder_layer, gradient_checkpointing
                    )
                    for cross_attn_layer, decoder_layer in zip(
                        self.cross_attn_layers, self.old_decoder_blocks
                    )
                ]
            )
        )

    def forward(self, input_ids, attention_mask, **kwargs):
        if not self.initialized_pheye:
            raise ValueError(
                "phEYE layers are not initialized. Please call `init_pheye` first."
            )

        kwargs["input_ids"] = input_ids
        kwargs["attention_mask"] = attention_mask
        return super().forward(**kwargs)  # Call the other parent's forward method

    def is_conditioned(self) -> bool:
        """Check whether all decoder layers are already conditioned."""
        return all(l.is_conditioned() for l in self._get_decoder_layers())

    def clear_conditioned_layers(self):
        for layer in self._get_decoder_layers():
            layer.condition_vis_x(None)
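A minimal sketch (not part of this commit) of which decoder layers receive a CrossAttentionBlock in init_pheye: a layer gets one when (layer_idx + 1) % cross_attn_every_n_layers == 0 and layer_idx >= from_layer; the rest are wrapped with cross_attn_layer=None and behave as plain decoder layers. The layer counts below are examples:

def cross_attn_layout(n_layers, cross_attn_every_n_layers, from_layer=0):
    # Indices of decoder layers that get a cross-attention block.
    return [
        layer_idx
        for layer_idx in range(n_layers)
        if (layer_idx + 1) % cross_attn_every_n_layers == 0 and layer_idx >= from_layer
    ]

print(cross_attn_layout(24, 4))                # -> [3, 7, 11, 15, 19, 23]
print(cross_attn_layout(24, 4, from_layer=8))  # -> [11, 15, 19, 23]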
pheye_builder/xattn.py
ADDED
@@ -0,0 +1,159 @@
"""
Based on: https://github.com/lucidrains/flamingo-pytorch
"""

from einops import rearrange
from einops_exts import rearrange_many
from torch import einsum, nn
import math

def exists(val):
    return val is not None


class FeedForward(nn.Module):

    def __init__(self, dim, dtype, reduce_factor = 1):
        super().__init__()
        mult = 4
        self.norm = nn.LayerNorm(dim, dtype=dtype)
        inner_dim = int(dim * mult) // reduce_factor

        self.fc1 = nn.Linear(dim, inner_dim, dtype=dtype)
        self.fc2 = nn.Linear(inner_dim, dim, dtype=dtype)
        self.act = nn.GELU()

    def forward(self, x):

        x = self.norm(x)
        x = self.fc1(x)
        x = self.act(x)
        x = self.fc2(x)

        return x

# cross attention
class CrossAttention(nn.Module):
    def __init__(
        self,
        *,
        dim_text,
        dim_visual,
        dtype,
        dim_head=64,
        reduce_factor=1
    ):
        super().__init__()
        self.scale = dim_head**-0.5
        max_dim = max(dim_text, dim_visual)
        self.heads = max_dim // dim_head
        assert max_dim % dim_head == 0, f"Number of heads in CrossAttention is not an int - {self.heads}"
        inner_dim = max_dim // reduce_factor

        self.norm = nn.LayerNorm(dim_text, dtype=dtype)

        self.to_q = nn.Linear(dim_text, inner_dim, dtype=dtype)
        self.to_kv = nn.Linear(dim_visual, inner_dim * 2, dtype=dtype)
        #self.to_kv_second = nn.Linear(dim_visual, inner_dim * 2)
        self.to_out = nn.Linear(inner_dim, dim_text, dtype=dtype)

    def forward(self, x, media):
        """
        Args:
            x (torch.Tensor): text features
                shape (B, txt_seq, D_txt)
            media (torch.Tensor): image features
                shape (B, img_seq, D_img) where img_seq is the number of concatenated features from the ViT. For example:
                for an encoder of 224x224 with patch size 14 and processing images of 896x896 (with 3 levels) it will be (1 + 4 + 16) * 257 = 5397
        """

        h = self.heads

        x = self.norm(x)
        q = self.to_q(x)

        k, v = self.to_kv(media).chunk(2, dim=-1)
        q, k, v = rearrange_many((q, k, v), "b n (h d) -> b h n d", h=h)

        q = q * self.scale

        sim = einsum("... i d, ... j d -> ... i j", q, k)

        attn = sim.softmax(dim=-1)

        out = einsum("... i j, ... j d -> ... i d", attn, v)
        out = rearrange(out, "b h n d -> b n (h d)")
        return self.to_out(out)

# cross attention block
class CrossAttentionBlock(nn.Module):
    def __init__(
        self,
        *,
        dim_text,
        dim_visual,
        dtype,
        dim_head=64,
        reduce_factor = 1,
        layer_idx=0,
        n_decoder_layers = 24
    ):
        super().__init__()
        self.attn = CrossAttention(
            dim_text=dim_text,
            dim_visual=dim_visual,
            dim_head=dim_head,
            reduce_factor=reduce_factor,
            dtype=dtype
        )

        self.ff = FeedForward(dim_text, reduce_factor=reduce_factor, dtype=dtype)
        self.layer_idx = layer_idx
        self.n_decoder_layers = n_decoder_layers

        self.apply(self._init_weights)

    def forward(
        self,
        x,
        media
    ):

        x = (
            self.attn(
                x,
                media
            )
            + x
        )

        x = self.ff(x) + x

        return x

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.01)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        for name, p in module.named_parameters():
            if name == "fc2.weight" or name == "to_out.weight":
                p.data.normal_(mean=0.0, std=(0.01 / math.sqrt(2 * max(self.n_decoder_layers, 36))))
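A minimal sketch (not part of this commit) of the tensor shapes CrossAttentionBlock works with: text queries attend over the concatenated vision tokens and the output stays in the text dimension. Assumes the pheye_builder package from this commit is importable and that torch, einops, and einops-exts (see requirements.txt) are installed; the dimensions are examples:

import torch
from pheye_builder.xattn import CrossAttentionBlock

dim_text, dim_visual = 2048, 1024
block = CrossAttentionBlock(dim_text=dim_text, dim_visual=dim_visual, dtype=torch.float32)

text = torch.randn(1, 16, dim_text)          # (B, txt_seq, D_txt)
media = torch.randn(1, 5 * 257, dim_visual)  # (B, img_seq, D_img): 5 patches of 257 CLIP tokens at level 2

out = block(text, media)
print(out.shape)  # torch.Size([1, 16, 2048]): residual output keeps the text shape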
requirements.txt
CHANGED
@@ -1 +1,8 @@
The file previously pinned only huggingface_hub==0.22.2; it now pins:

huggingface_hub==0.22.2
transformers==4.37.0
pillow==10.3.0
torch==2.1.1
torchvision==0.16.1
peft==0.7.0
einops==0.6.1
einops-exts==0.0.4