chendl committed
Commit
e770d90
Parent: e9a2cd8

update cap

Files changed (50)
  1. app.py +3 -2
  2. multimodal/build/lib/open_flamingo/__init__.py +2 -0
  3. multimodal/build/lib/open_flamingo/chat/__init__.py +0 -0
  4. multimodal/build/lib/open_flamingo/chat/conversation.py +571 -0
  5. multimodal/build/lib/open_flamingo/eval/__init__.py +1 -0
  6. multimodal/build/lib/open_flamingo/eval/classification.py +147 -0
  7. multimodal/build/lib/open_flamingo/eval/coco_metric.py +23 -0
  8. multimodal/build/lib/open_flamingo/eval/dataset_zoo/__init__.py +33 -0
  9. multimodal/build/lib/open_flamingo/eval/dataset_zoo/aro_datasets.py +365 -0
  10. multimodal/build/lib/open_flamingo/eval/dataset_zoo/constants.py +3 -0
  11. multimodal/build/lib/open_flamingo/eval/dataset_zoo/perturbations.py +194 -0
  12. multimodal/build/lib/open_flamingo/eval/dataset_zoo/retrieval.py +266 -0
  13. multimodal/build/lib/open_flamingo/eval/dataset_zoo/utils.py +15 -0
  14. multimodal/build/lib/open_flamingo/eval/eval_datasets.py +101 -0
  15. multimodal/build/lib/open_flamingo/eval/evaluate.py +1435 -0
  16. multimodal/build/lib/open_flamingo/eval/evaluate_debug.py +1159 -0
  17. multimodal/build/lib/open_flamingo/eval/evaluate_find_showcase.py +1700 -0
  18. multimodal/build/lib/open_flamingo/eval/evaluate_temp.py +1838 -0
  19. multimodal/build/lib/open_flamingo/eval/imagenet_utils.py +1007 -0
  20. multimodal/build/lib/open_flamingo/eval/ok_vqa_utils.py +213 -0
  21. multimodal/build/lib/open_flamingo/eval/task/__init__.py +0 -0
  22. multimodal/build/lib/open_flamingo/eval/task/caption.py +419 -0
  23. multimodal/build/lib/open_flamingo/eval/task/caption_chat.py +417 -0
  24. multimodal/build/lib/open_flamingo/eval/task/cola.py +220 -0
  25. multimodal/build/lib/open_flamingo/eval/task/crepe.py +93 -0
  26. multimodal/build/lib/open_flamingo/eval/task/gqa.py +248 -0
  27. multimodal/build/lib/open_flamingo/eval/task/mmbench.py +84 -0
  28. multimodal/build/lib/open_flamingo/eval/task/reg.py +141 -0
  29. multimodal/build/lib/open_flamingo/eval/task/utils.py +287 -0
  30. multimodal/build/lib/open_flamingo/eval/task/vl_checklist.py +113 -0
  31. multimodal/build/lib/open_flamingo/eval/vqa_metric.py +594 -0
  32. multimodal/build/lib/open_flamingo/src/__init__.py +0 -0
  33. multimodal/build/lib/open_flamingo/src/attention.py +45 -0
  34. multimodal/build/lib/open_flamingo/src/factory.py +269 -0
  35. multimodal/build/lib/open_flamingo/src/flamingo.py +637 -0
  36. multimodal/build/lib/open_flamingo/src/flamingo_lm.py +173 -0
  37. multimodal/build/lib/open_flamingo/src/gcn.py +137 -0
  38. multimodal/build/lib/open_flamingo/src/helpers.py +263 -0
  39. multimodal/build/lib/open_flamingo/src/utils.py +31 -0
  40. multimodal/build/lib/open_flamingo/train/__init__.py +1 -0
  41. multimodal/build/lib/open_flamingo/train/data2.py +868 -0
  42. multimodal/build/lib/open_flamingo/train/distributed.py +128 -0
  43. multimodal/build/lib/open_flamingo/train/instruction_template.py +13 -0
  44. multimodal/build/lib/open_flamingo/train/train.py +709 -0
  45. multimodal/build/lib/open_flamingo/train/train_utils.py +387 -0
  46. multimodal/open_flamingo.egg-info/PKG-INFO +247 -0
  47. multimodal/open_flamingo.egg-info/SOURCES.txt +53 -0
  48. multimodal/open_flamingo.egg-info/dependency_links.txt +1 -0
  49. multimodal/open_flamingo.egg-info/requires.txt +17 -0
  50. multimodal/open_flamingo.egg-info/top_level.txt +1 -0
app.py CHANGED
@@ -53,7 +53,8 @@ flamingo, image_processor, tokenizer, vis_embed_size = create_model_and_transforms(
 )
 
 
-checkpoint_path = hf_hub_download("chendl/compositional_test", "pythiaS.pt")
+checkpoint_path = "/home/aimos/huggingface/space/demo.pt"
+# hf_hub_download("chendl/compositional_test", "pythiaS.pt")
 checkpoint = torch.load(checkpoint_path, map_location="cpu")["model_state_dict"]
 model_state_dict = {}
 for key in checkpoint.keys():
@@ -326,7 +327,7 @@ with gr.Blocks() as demo:
 clear.click(gradio_reset, [chat_state, img_list], [chatbot, image, text_input, upload_button, chat_state, img_list],
             queue=False)
 
-demo.launch(enable_queue=True)
+demo.launch(enable_queue=True,share=True)
 #
 # with gr.Blocks() as demo:
 #     gr.Markdown(
multimodal/build/lib/open_flamingo/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .src.flamingo import Flamingo
2
+ from .src.factory import create_model_and_transforms
multimodal/build/lib/open_flamingo/chat/__init__.py ADDED
File without changes
multimodal/build/lib/open_flamingo/chat/conversation.py ADDED
@@ -0,0 +1,571 @@
1
+ import argparse
2
+ import time
3
+ import re
4
+ from PIL import Image
5
+
6
+ import torch
7
+ import numpy as np
8
+ import transformers
9
+ from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer
10
+ from transformers import StoppingCriteria, StoppingCriteriaList
11
+
12
+ import dataclasses
13
+ from enum import auto, Enum
14
+ from typing import List, Tuple, Any
15
+
16
+ import string
17
+ import cv2
18
+ import gradio as gr
19
+
20
+ from huggingface_hub import hf_hub_download, login
21
+
22
+ from open_flamingo.src.factory import create_model_and_transforms
23
+ from open_flamingo.eval.task.caption_chat import captioner
24
+
25
+ class SeparatorStyle(Enum):
26
+ """Different separator style."""
27
+ SINGLE = auto()
28
+ TWO = auto()
29
+
30
+
31
+ @dataclasses.dataclass
32
+ class Conversation:
33
+ """A class that keeps all conversation history."""
34
+ system: str
35
+ roles: List[str]
36
+ messages: List[List[str]]
37
+ offset: int
38
+ # system_img: List[Image.Image] = []
39
+ sep_style: SeparatorStyle = SeparatorStyle.SINGLE
40
+ sep: str = "###"
41
+ sep2: str = None
42
+
43
+ skip_next: bool = False
44
+ conv_id: Any = None
45
+
46
+ def get_prompt(self):
47
+ if self.sep_style == SeparatorStyle.SINGLE:
48
+ ret = self.system + self.sep
49
+ for role, message in self.messages:
50
+ if message:
51
+ ret += role + ": " + message + self.sep
52
+ else:
53
+ ret += role + ":"
54
+ return ret
55
+ elif self.sep_style == SeparatorStyle.TWO:
56
+ seps = [self.sep, self.sep2]
57
+ ret = self.system + seps[0]
58
+ for i, (role, message) in enumerate(self.messages):
59
+ if message:
60
+ ret += role + ": " + message + seps[i % 2]
61
+ else:
62
+ ret += role + ":"
63
+ return ret
64
+ else:
65
+ raise ValueError(f"Invalid style: {self.sep_style}")
66
+
67
+ def append_message(self, role, message):
68
+ self.messages.append([role, message])
69
+
70
+ def to_gradio_chatbot(self):
71
+ ret = []
72
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
73
+ if i % 2 == 0:
74
+ ret.append([msg, None])
75
+ else:
76
+ ret[-1][-1] = msg
77
+ return ret
78
+
79
+ def copy(self):
80
+ return Conversation(
81
+ system=self.system,
82
+ # system_img=self.system_img,
83
+ roles=self.roles,
84
+ messages=[[x, y] for x, y in self.messages],
85
+ offset=self.offset,
86
+ sep_style=self.sep_style,
87
+ sep=self.sep,
88
+ sep2=self.sep2,
89
+ conv_id=self.conv_id)
90
+
91
+ def dict(self):
92
+ return {
93
+ "system": self.system,
94
+ # "system_img": self.system_img,
95
+ "roles": self.roles,
96
+ "messages": self.messages,
97
+ "offset": self.offset,
98
+ "sep": self.sep,
99
+ "sep2": self.sep2,
100
+ "conv_id": self.conv_id,
101
+ }
102
+
103
+
104
+ class StoppingCriteriaSub(StoppingCriteria):
105
+
106
+ def __init__(self, stops=[], encounters=1):
107
+ super().__init__()
108
+ self.stops = stops
109
+
110
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
111
+ for stop in self.stops:
112
+ if torch.all((stop == input_ids[0][-len(stop):])).item():
113
+ return True
114
+
115
+ return False
116
+
117
+
118
+ CONV_VISION = Conversation(
119
+ system="Give the following image: <Img>ImageContent</Img>. "
120
+ "You will be able to see the image once I provide it to you. Please answer my questions.",
121
+ roles=("Human", "Assistant"),
122
+ messages=[],
123
+ offset=2,
124
+ sep_style=SeparatorStyle.SINGLE,
125
+ sep="###",
126
+ )
127
+
128
+ def get_outputs(
129
+ model,
130
+ batch_images,
131
+ attention_mask,
132
+ max_generation_length,
133
+ min_generation_length,
134
+ num_beams,
135
+ length_penalty,
136
+ input_ids,
137
+ image_start_index_list=None,
138
+ image_nums=None,
139
+ bad_words_ids=None,
140
+ ):
141
+ # and torch.cuda.amp.autocast(dtype=torch.float16)
142
+ with torch.inference_mode():
143
+ outputs = model(
144
+ vision_x=batch_images,
145
+ lang_x=input_ids,
146
+ attention_mask=attention_mask,
147
+ labels=None,
148
+ image_nums=image_nums,
149
+ image_start_index_list=image_start_index_list,
150
+ added_bbox_list=None,
151
+ add_box=False,
152
+ )
153
+ # outputs = model.generate(
154
+ # batch_images,
155
+ # input_ids,
156
+ # attention_mask=attention_mask,
157
+ # max_new_tokens=max_generation_length,
158
+ # min_length=min_generation_length,
159
+ # num_beams=num_beams,
160
+ # length_penalty=length_penalty,
161
+ # image_start_index_list=image_start_index_list,
162
+ # image_nums=image_nums,
163
+ # bad_words_ids=bad_words_ids,
164
+ # )
165
+
166
+ return outputs
167
+
168
+ def generate(
169
+ idx,
170
+ image,
171
+ text,
172
+ image_processor,
173
+ tokenizer,
174
+ flamingo,
175
+ vis_embed_size=256,
176
+ rank=0,
177
+ world_size=1,
178
+ ):
179
+ if image is None:
180
+ raise gr.Error("Please upload an image.")
181
+ flamingo.eval()
182
+ loc_token_ids = []
183
+ for i in range(1000):
184
+ loc_token_ids.append(int(tokenizer(f"<loc_{i}>", add_special_tokens=False)["input_ids"][-1]))
185
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
186
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
187
+ pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
188
+ bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
189
+ prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
190
+
191
+ image_ori = image
192
+ image = image.convert("RGB")
193
+ width = image.width
194
+ height = image.height
195
+ image = image.resize((224, 224))
196
+ batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
197
+ if idx == 1:
198
+ prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token * vis_embed_size}<|#endofimage#|><|#object#|> {text.rstrip('.').strip()}<|#endofobject#|><|#visual#|>"]
199
+ bad_words_ids = None
200
+ max_generation_length = 5
201
+ else:
202
+ prompt = [f"<|#image#|>{tokenizer.pad_token * vis_embed_size}<|#endofimage#|>{text.rstrip('.')}"]
203
+ bad_words_ids = loc_word_ids
204
+ max_generation_length = 300
205
+ encodings = tokenizer(
206
+ prompt,
207
+ padding="longest",
208
+ truncation=True,
209
+ return_tensors="pt",
210
+ max_length=2000,
211
+ )
212
+ input_ids = encodings["input_ids"]
213
+ attention_mask = encodings["attention_mask"]
214
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
215
+ image_start_index_list = [[x] for x in image_start_index_list]
216
+ image_nums = [1] * len(input_ids)
217
+ outputs = get_outputs(
218
+ model=flamingo,
219
+ batch_images=batch_images,
220
+ attention_mask=attention_mask,
221
+ max_generation_length=max_generation_length,
222
+ min_generation_length=4,
223
+ num_beams=1,
224
+ length_penalty=1.0,
225
+ input_ids=input_ids,
226
+ bad_words_ids=bad_words_ids,
227
+ image_start_index_list=image_start_index_list,
228
+ image_nums=image_nums,
229
+ )
230
+
231
+ boxes = outputs["boxes"]
232
+ scores = outputs["scores"]
233
+ if len(scores) > 0:
234
+ box = boxes[scores.argmax()]/224
235
+ print(f"{box}")
236
+
237
+
238
+ if len(boxes)>0:
239
+ open_cv_image = np.array(image_ori)
240
+ # Convert RGB to BGR
241
+ open_cv_image = open_cv_image[:, :, ::-1].copy()
242
+ box = box*[width,height,width,height]
243
+ # for box in boxes:
244
+ open_cv_image = cv2.rectangle(open_cv_image, box[:2].astype(int), box[2:].astype(int), (255, 0, 0), 2)
245
+ out_image = Image.fromarray(cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2RGB))
246
+ return f"Output:{box}", out_image
247
+ else:
248
+ gen_text = tokenizer.batch_decode(outputs)
249
+ return (f"{gen_text}")
250
+
251
+ def preprocess_conv(data):
252
+ conversation = ""
253
+ BEGIN_SIGNAL = "### "
254
+ END_SIGNAL = "\n"
255
+ for idx, d in enumerate(data):
256
+ from_str = d["from"]
257
+ if from_str.lower() == "human":
258
+ from_str = "Human"
259
+ elif from_str.lower() == "gpt":
260
+ from_str = "Assistant"
261
+ else:
262
+ from_str = 'unknown'
263
+ conversation += (BEGIN_SIGNAL + from_str + ": " + d["value"] + END_SIGNAL)
264
+ return conversation
265
+
266
+ def preprocess_image(sample, image_processor):
267
+ image = image_processor(sample)
268
+ if isinstance(image, transformers.image_processing_utils.BatchFeature):
269
+ image = torch.tensor(image["pixel_values"][0])
270
+ return image
271
+
272
+ class Chat:
273
+ def __init__(self, model, vis_processor, tokenizer, vis_embed_size ):
274
+ self.model = model
275
+ self.vis_processor = vis_processor
276
+ self.tokenizer = tokenizer
277
+ self.vis_embed_size = vis_embed_size
278
+ self.conv = []
279
+ # stop_words_ids = [torch.tensor([835]).to(self.device),
280
+ # torch.tensor([2277, 29937]).to(self.device)] # '###' can be encoded in two different ways.
281
+ # self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
282
+
283
+ def ask(self, text, conv,radio):
284
+ if radio in ["Cap"]:
285
+ conv.append({
286
+ "from": "human",
287
+ "value": "",
288
+ })
289
+ elif radio in ["VQA"]:
290
+ conv.append({
291
+ "from": "human",
292
+ "value": f"Answer the question using a single word or phrase. {text}",
293
+ })
294
+ elif radio in ["REC"]:
295
+ conv.append({
296
+ "from": "human",
297
+ "value": f"Please provide the bounding box coordinate of the region this sentence describes: {text}.",
298
+ })
299
+ else:
300
+ conv.append({
301
+ "from": "human",
302
+ "value": text,
303
+ })
304
+ # if len(conv.messages) > 0 and conv.messages[-1][0] == conv.roles[0] \
305
+ # and conv.messages[-1][1][-6:] == '</Img>': # last message is image.
306
+ # conv.messages[-1][1] = ' '.join([conv.messages[-1][1], text])
307
+ # else:
308
+ # conv.append_message(conv.roles[0], text)
309
+
310
+ def answer(self, conv, img_list, radio, text_input, max_new_tokens=200, num_beams=5, min_length=1, top_p=0.9,
311
+ repetition_penalty=1.0, length_penalty=1, temperature=1, max_length=2000):
312
+ # conv.append_message(conv.roles[1], None)
313
+ # embs = self.get_context_emb(conv, img_list)
314
+ #
315
+ # # current_max_len = embs.shape[1] + max_new_tokens + 100
316
+ # # begin_idx = max(0, current_max_len - max_length)
317
+ # # embs = embs[:, begin_idx:]
318
+ # outputs = self.model.llama_model.generate(
319
+ # inputs_embeds=embs,
320
+ # max_new_tokens=max_new_tokens,
321
+ # stopping_criteria=self.stopping_criteria,
322
+ # num_beams=num_beams,
323
+ # min_length=min_length,
324
+ # top_p=top_p,
325
+ # repetition_penalty=repetition_penalty,
326
+ # length_penalty=length_penalty,
327
+ # temperature=temperature,
328
+ # )
329
+ # output_token = outputs[0]
330
+ # if output_token[0] == 0:
331
+ # output_token = output_token[1:]
332
+ # output_text = self.model.llama_tokenizer.decode(output_token, add_special_tokens=False)
333
+ # output_text = output_text.split('###')[0] # remove the stop sign '###'
334
+ # output_text = output_text.split('Assistant:')[-1].strip()
335
+ # conv.messages[-1][1] = output_text
336
+ visual_token = "<|#visual#|>"
337
+ previsual_token = "<|#previsual#|>"
338
+ box_token = "<|#box#|>"
339
+ prebox_token = "<|#prebox#|>"
340
+ end_token = "<|#endofobject#|>"
341
+ object_token = "<|#object#|>"
342
+ end_of_attr_token = "<|#endofattr#|>"
343
+ preend_of_attr_token = "<|#preendofattr#|>"
344
+ media_token_id = self.tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
345
+ box_token_id = self.tokenizer("<|#box#|>", add_special_tokens=False)["input_ids"][-1]
346
+ endofobject_token_id = self.tokenizer("<|#endofobject#|>", add_special_tokens=False)["input_ids"][-1]
347
+ endofattr_token_id = self.tokenizer("<|#endofattr#|>", add_special_tokens=False)["input_ids"][-1]
348
+ endofmedia_token_id = self.tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
349
+ visual_token_id = self.tokenizer("<|#visual#|>", add_special_tokens=False)["input_ids"][-1]
350
+ previsual_token_id = self.tokenizer("<|#previsual#|>", add_special_tokens=False)["input_ids"][-1]
351
+ prebox_token_id = self.tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
352
+ size = 224
353
+ self.model.eval()
354
+ # "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/cdl/tmp_img/chat_vis/chat19.png"
355
+ # image_path = input("Please enter the image path: ")
356
+ image = img_list[0].convert("RGB")
357
+ image_ori = image
358
+ image = image.resize((size, size))
359
+ print(f"image size: {image.size}")
360
+ batch_images = preprocess_image(image, self.vis_processor).unsqueeze(0).unsqueeze(1).unsqueeze(0)
361
+
362
+ # conversation = []
363
+ human_sentence = None
364
+ if radio in ["Cap","VQA"]:
365
+ conv.append({
366
+ "from": "gpt",
367
+ "value": "",
368
+ })
369
+ elif radio in ["REC"]:
370
+ conv.append(
371
+ {
372
+ "from": "gpt",
373
+ "value": object_token + text_input + end_token + visual_token,
374
+ }
375
+ )
376
+ else:
377
+ conv.append({
378
+ "from": "gpt",
379
+ "value": "",
380
+ })
381
+ # while True:
382
+ # human_sentence = input("### Human: ")
383
+ # if human_sentence == "#end#":
384
+ # break
385
+ # conversation.append({
386
+ # "from": "human",
387
+ # "value": human_sentence,
388
+ # })
389
+ # conversation.append({
390
+ # "from": "gpt",
391
+ # "value": "",
392
+ # })
393
+ text = preprocess_conv(conv).strip()
394
+ caption = f"<|#image#|>{self.tokenizer.pad_token * self.vis_embed_size}<|#endofimage#|>{text}"
395
+ encodings = self.tokenizer(
396
+ caption,
397
+ padding="longest",
398
+ truncation=True,
399
+ return_tensors="pt",
400
+ max_length=2000,
401
+ )
402
+ input_ids = encodings["input_ids"]
403
+ attention_mask = encodings["attention_mask"]
404
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
405
+ image_start_index_list = [[x] for x in image_start_index_list]
406
+ image_nums = [1] * len(input_ids)
407
+ added_bbox_list = []
408
+ if radio in ["Cap"]:
409
+ output_text, out_image = captioner(self.model,self.tokenizer,image_ori,batch_images,input_ids,attention_mask,image_start_index_list,image_nums,added_bbox_list)
410
+ else:
411
+ with torch.inference_mode():
412
+ text_outputs = self.model.generate(
413
+ batch_images,
414
+ input_ids,
415
+ attention_mask=attention_mask,
416
+ max_new_tokens=20,
417
+ # min_new_tokens=8,
418
+ num_beams=1,
419
+ # length_penalty=0,
420
+ image_start_index_list=image_start_index_list,
421
+ image_nums=image_nums,
422
+ added_bbox_list=added_bbox_list if len(added_bbox_list) != 0 else None,
423
+ )
424
+ # and torch.cuda.amp.autocast(dtype=torch.float16)
425
+ with torch.no_grad():
426
+ outputs = self.model(
427
+ vision_x=batch_images,
428
+ lang_x=input_ids,
429
+ attention_mask=attention_mask,
430
+ image_nums=image_nums,
431
+ image_start_index_list=image_start_index_list,
432
+ added_bbox_list=None,
433
+ add_box=False,
434
+ )
435
+ boxes = outputs["boxes"]
436
+ scores = outputs["scores"]
437
+ if len(scores) > 0:
438
+ box = boxes[scores.argmax()] / 224
439
+ print(f"{box}")
440
+ out_image = None
441
+
442
+ if len(boxes)>0:
443
+ width, height = image_ori.size
444
+ open_cv_image = np.array(image_ori)
445
+ # Convert RGB to BGR
446
+ open_cv_image = open_cv_image[:, :, ::-1].copy()
447
+ box = box * [width, height, width, height]
448
+ # for box in boxes:
449
+ open_cv_image = cv2.rectangle(open_cv_image, box[:2].astype(int), box[2:].astype(int), (255, 0, 0), 2)
450
+ out_image = Image.fromarray(cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2RGB))
451
+
452
+
453
+ # output_token = outputs[0, input_ids.shape[1]:]
454
+ # output_text = tokenizer.decode(output_token, skip_special_tokens=True).strip()
455
+ # conv[-1]["value"] = output_text
456
+ # # conv.messages[-1][1] = output_text
457
+ # print(
458
+ # f"### Assistant: {tokenizer.decode(outputs[0, input_ids.shape[1]:], skip_special_tokens=True).strip()}")
459
+ output_text = self.tokenizer.decode(text_outputs[0])
460
+ print(output_text)
461
+ output_text = re.findall(r'Assistant:(.+)', output_text)[-1]
462
+ print(output_text)
463
+
464
+ return output_text, out_image
465
+
466
+ def upload_img(self, image, conv, img_list):
467
+ img_list.append(image)
468
+ # if isinstance(image, str): # is a image path
469
+ # raw_image = Image.open(image).convert('RGB')
470
+ # image = image.resize((224, 224))
471
+ # image = self.vis_processor(raw_image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
472
+ # elif isinstance(image, Image.Image):
473
+ # raw_image = image
474
+ # image = image.resize((224, 224))
475
+ # image = self.vis_processor(raw_image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
476
+ # elif isinstance(image, torch.Tensor):
477
+ # if len(image.shape) == 3:
478
+ # image = image.unsqueeze(0)
479
+ # # image = image.to(self.device)
480
+ #
481
+ # # image_emb, _ = self.model.encode_img(image)
482
+ # img_list.append(image_emb)
483
+ # conv.append_message(conv.roles[0], "<Img><ImageHere></Img>")
484
+ msg = "Received."
485
+ # self.conv.append_message(self.conv.roles[1], msg)
486
+ return msg
487
+
488
+ # def get_context_emb(self, conv, img_list):
489
+ # prompt = conv.get_prompt()
490
+ # prompt_segs = prompt.split('<ImageHere>')
491
+ # assert len(prompt_segs) == len(img_list) + 1, "Unmatched numbers of image placeholders and images."
492
+ # seg_tokens = [
493
+ # self.model.llama_tokenizer(
494
+ # seg, return_tensors="pt", add_special_tokens=i == 0).to(self.device).input_ids
495
+ # # only add bos to the first seg
496
+ # for i, seg in enumerate(prompt_segs)
497
+ # ]
498
+ # seg_embs = [self.model.llama_model.model.embed_tokens(seg_t) for seg_t in seg_tokens]
499
+ # mixed_embs = [emb for pair in zip(seg_embs[:-1], img_list) for emb in pair] + [seg_embs[-1]]
500
+ # mixed_embs = torch.cat(mixed_embs, dim=1)
501
+ # return mixed_embs
502
+
503
+ def evaluate_exp(
504
+ model,
505
+ tokenizer,
506
+ image_processor,
507
+ vis_embed_size=None,
508
+ rank=0,
509
+ world_size=1,
510
+ id=0,
511
+ add_visual=True,
512
+ ):
513
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
514
+ box_token_id = tokenizer("<|#box#|>", add_special_tokens=False)["input_ids"][-1]
515
+ endofobject_token_id = tokenizer("<|#endofobject#|>", add_special_tokens=False)["input_ids"][-1]
516
+ endofattr_token_id = tokenizer("<|#endofattr#|>", add_special_tokens=False)["input_ids"][-1]
517
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
518
+ visual_token_id = tokenizer("<|#visual#|>", add_special_tokens=False)["input_ids"][-1]
519
+ previsual_token_id = tokenizer("<|#previsual#|>", add_special_tokens=False)["input_ids"][-1]
520
+ prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
521
+ size = image_processor.size["shortest_edge"]
522
+ model.eval()
523
+ # "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/cdl/tmp_img/chat_vis/chat19.png"
524
+ image_path = input("Please enter the image path: ")
525
+ image = Image.open(image_path).convert("RGB")
526
+ image = image.resize((size, size))
527
+ print(f"image size: {image.size}")
528
+ batch_images = preprocess_image(image, image_processor).unsqueeze(0).unsqueeze(1).unsqueeze(0)
529
+ conversation = []
530
+ human_sentence = None
531
+ while True:
532
+ human_sentence = input("### Human: ")
533
+ if human_sentence == "#end#":
534
+ break
535
+ conversation.append({
536
+ "from": "human",
537
+ "value": human_sentence,
538
+ })
539
+ conversation.append({
540
+ "from": "gpt",
541
+ "value": "",
542
+ })
543
+ text = preprocess_conv(conversation).strip()
544
+ caption = f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>{text}"
545
+ encodings = tokenizer(
546
+ caption,
547
+ padding="longest",
548
+ truncation=True,
549
+ return_tensors="pt",
550
+ max_length=2000,
551
+ )
552
+ input_ids = encodings["input_ids"].to("cuda")
553
+ attention_mask = encodings["attention_mask"].to("cuda")
554
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
555
+ image_start_index_list = [[x] for x in image_start_index_list]
556
+ image_nums = [1] * len(input_ids)
557
+ with torch.no_grad() and torch.cuda.amp.autocast(dtype=torch.float16):
558
+ outputs = model.generate(
559
+ batch_images,
560
+ input_ids,
561
+ attention_mask=attention_mask,
562
+ max_new_tokens=100,
563
+ # min_new_tokens=8,
564
+ num_beams=1,
565
+ image_start_index_list=image_start_index_list,
566
+ image_nums=image_nums,
567
+ )
568
+ print(f"### Assistant: {tokenizer.decode(outputs[0, input_ids.shape[1]:], skip_special_tokens=True).strip()}")
569
+
570
+
571
+
multimodal/build/lib/open_flamingo/eval/__init__.py ADDED
@@ -0,0 +1 @@
1
+
multimodal/build/lib/open_flamingo/eval/classification.py ADDED
@@ -0,0 +1,147 @@
1
+ from typing import Dict, Sequence, Tuple
2
+ import re
3
+ import numpy as np
4
+ import torch
5
+
6
+
7
+ def postprocess_classification_generation(predictions) -> str:
8
+ return re.split("Prompt|Completion", predictions, 1)[0]
9
+
10
+
11
+ def compute_classification_accuracy(predictions: Sequence[Dict[str, str]]) -> float:
12
+ """Compute the accuracy of a sequence of predictions."""
13
+
14
+ def _preprocess_fn(s):
15
+ """Function to preprocess both targets and predictions."""
16
+ return s.lower()
17
+
18
+ is_correct = [
19
+ _preprocess_fn(x["prediction"]) == _preprocess_fn(x["class_label"])
20
+ for x in predictions
21
+ ]
22
+
23
+ return np.mean(is_correct).item()
24
+
25
+
26
+ def compute_shifted_logits_and_labels(
27
+ logits: torch.Tensor, encodings, tokenizer, eoc_token_id
28
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
29
+ """Helper function to compute shifted logits and labels.
30
+
31
+ This allows for straightforward computation of the loss on shift_logits
32
+ and shift_labels such that the nth element of logits computes the n-1th
33
+ element of the original labels (in the outputs, the nth element of logits
34
+ corresponds to the nth element of the labels).
35
+
36
+ Elements in shift_labels that correspond to inputs are masked with values
37
+ of -100 (by default in hf, loss is only computed on token IDs >= 0).
38
+
39
+ Returns: tuple containing two elements:
40
+ shift_logits: a float Tensor of shape [batch_size, seq_len - 1].
41
+ shift_labels: an integer Tensor of shape [batch_size, seq_len - 1]
42
+ """
43
+
44
+ labels = encodings["input_ids"].clone()
45
+
46
+ # convert padding and EOC tokens to -100 so they are ignored in loss
47
+ labels[labels == tokenizer.pad_token_id] = -100
48
+ labels[labels == eoc_token_id] = -100
49
+
50
+ # Convert all tokens in prefix until separator to -100 so they are
51
+ # ignored in loss
52
+ for idx in range(len(labels)):
53
+ # Find the location of the last token of prefix *from right*,
54
+ # since the first non-padding token of the sequence will also be
55
+ # eos_token (because bos_token and eos_token are the same for
56
+ # the tokenizer).
57
+ end_of_prefix = -labels[idx].tolist()[::-1].index(tokenizer.eos_token_id) - 1
58
+ labels[idx, : end_of_prefix + 1] = -100
59
+
60
+ # Shift so that tokens < n predict n. The shifted tensors both have
61
+ # shape [batch_size, seq_len - 1].
62
+ shift_logits = logits[..., :-1, :].contiguous()
63
+ shift_labels = labels[..., 1:].contiguous()
64
+
65
+ return shift_logits, shift_labels
66
+
67
+
68
+ def compute_per_sample_probs(
69
+ encodings, tokenizer, logits: torch.Tensor, eoc_token_id
70
+ ) -> torch.Tensor:
71
+ """Helper function to compute per-sample probability of the input sequence.
72
+
73
+ Assumes <eos token> is used to separate inputs from targets in the
74
+ prompt text
75
+ """
76
+ shift_logits, shift_labels = compute_shifted_logits_and_labels(
77
+ logits, encodings, tokenizer, eoc_token_id
78
+ )
79
+
80
+ # Tuple of tensors for unmasked label tokens. The first element of the
81
+ # tuple contains the batch indices; the second element contains the
82
+ # sequence indices.
83
+ unmasked_indices = torch.nonzero(shift_labels != -100, as_tuple=True)
84
+ # Tensor where the i^th element is the token_id corresponding to the i^th
85
+ # element of unmasked_indices
86
+ unmasked_token_ids = shift_labels[unmasked_indices]
87
+
88
+ # 3d tensor of [batch_idx, sequence_position, token_id] for unmasked tokens.
89
+ target_idxs = torch.column_stack([*unmasked_indices, unmasked_token_ids])
90
+ target_idxs = target_idxs.to(shift_logits.device)
91
+
92
+ # Sanity check that every element in batch has at least one unmasked
93
+ # target token
94
+ assert torch.all(
95
+ torch.bincount(target_idxs[:, 0]) != 0
96
+ ), "At least one element in batch has no unmasked target tokens."
97
+
98
+ # Renormalize over tokens to make sure they are proper probabilities via
99
+ # softmax over the token dimension.
100
+ shift_probs = torch.nn.functional.softmax(shift_logits, 2)
101
+
102
+ # Compute the probability of the target sequence (as the product of the
103
+ # probability of the individual tokens in the sequence).
104
+ target_probs = torch.ones(len(shift_labels), device=shift_logits.device)
105
+ for i, j, k in target_idxs:
106
+ target_probs[i] *= shift_probs[i, j, k]
107
+
108
+ return target_probs
109
+
110
+
111
+ def compute_per_sample_loss(encodings, tokenizer, logits, eoc_token_id) -> torch.Tensor:
112
+ """Helper function to compute per-sample classification loss.
113
+
114
+ Assumes <eos token> is used to separate inputs from targets in the
115
+ prompt text
116
+ """
117
+ shift_logits, shift_labels = compute_shifted_logits_and_labels(
118
+ logits, encodings, tokenizer, eoc_token_id
119
+ )
120
+
121
+ device = shift_logits.device
122
+
123
+ # Loss is computed token-wise, on Tensors of shape
124
+ # [batch_size * (seq_len - 1), vocab_size]
125
+ # and returns a loss tensor of shape
126
+ # [batch_size * (seq_len - 1)]. Most of the tokens will be masked
127
+ # in this computation.
128
+ loss = torch.nn.functional.cross_entropy(
129
+ shift_logits.view(-1, shift_logits.size(-1)),
130
+ shift_labels.view(-1).to(device),
131
+ reduction="none",
132
+ )
133
+
134
+ # Reshape to [batch_size, seq_len - 1]
135
+ loss = loss.view(shift_logits.size(0), shift_logits.size(1)).cpu()
136
+
137
+ # loss_mask is 1 for tokens we want included in the loss, and 0 for tokens
138
+ # that should be ignored in the loss.
139
+ loss_mask = (shift_labels != -100).int().cpu()
140
+
141
+ loss *= loss_mask
142
+
143
+ # Compute per-element loss : sum loss over all (unmasked) tokens and
144
+ # divide by number of variable tokens to obtain tensor of
145
+ # shape [batch_size,]
146
+ loss = loss.sum(dim=1) / (shift_labels != -100).sum(dim=1).float()
147
+ return loss
multimodal/build/lib/open_flamingo/eval/coco_metric.py ADDED
@@ -0,0 +1,23 @@
1
+ from pycocoevalcap.eval import COCOEvalCap
2
+ from pycocotools.coco import COCO
3
+ import json
4
+
5
+
6
+ def compute_cider(
7
+ result_path,
8
+ annotations_path,
9
+ ):
10
+ # create coco object and coco_result object
11
+ coco = COCO(annotations_path)
12
+ coco_result = coco.loadRes(result_path)
13
+
14
+ # create coco_eval object by taking coco and coco_result
15
+ coco_eval = COCOEvalCap(coco, coco_result)
16
+ coco_eval.params["image_id"] = coco_result.getImgIds()
17
+ coco_eval.evaluate()
18
+
19
+ return coco_eval.eval
20
+
21
+
22
+ def postprocess_captioning_generation(predictions):
23
+ return predictions
multimodal/build/lib/open_flamingo/eval/dataset_zoo/__init__.py ADDED
@@ -0,0 +1,33 @@
1
+ from .aro_datasets import VG_Relation, VG_Attribution, COCO_Order, Flickr30k_Order
2
+ from .retrieval import COCO_Retrieval, Flickr30k_Retrieval
3
+
4
+
5
+ def get_dataset(dataset_name, image_preprocess=None, text_perturb_fn=None, image_perturb_fn=None, download=False, *args, **kwargs):
6
+ """
7
+ Helper function that returns a dataset object with an evaluation function.
8
+ dataset_name: Name of the dataset.
9
+ image_preprocess: Preprocessing function for images.
10
+ text_perturb_fn: A function that takes in a string and returns a string. This is for perturbation experiments.
11
+ image_perturb_fn: A function that takes in a PIL image and returns a PIL image. This is for perturbation experiments.
12
+ download: Whether to allow downloading images if they are not found.
13
+ """
14
+ if dataset_name == "VG_Relation":
15
+ from .aro_datasets import get_visual_genome_relation
16
+ return get_visual_genome_relation(image_preprocess=image_preprocess, text_perturb_fn=text_perturb_fn, image_perturb_fn=image_perturb_fn, download=download, *args, **kwargs)
17
+ elif dataset_name == "VG_Attribution":
18
+ from .aro_datasets import get_visual_genome_attribution
19
+ return get_visual_genome_attribution(image_preprocess=image_preprocess, text_perturb_fn=text_perturb_fn, image_perturb_fn=image_perturb_fn, download=download, *args, **kwargs)
20
+ elif dataset_name == "COCO_Order":
21
+ from .aro_datasets import get_coco_order
22
+ return get_coco_order(image_preprocess=image_preprocess, text_perturb_fn=text_perturb_fn, image_perturb_fn=image_perturb_fn, download=download, *args, **kwargs)
23
+ elif dataset_name == "Flickr30k_Order":
24
+ from .aro_datasets import get_flickr30k_order
25
+ return get_flickr30k_order(image_preprocess=image_preprocess, text_perturb_fn=text_perturb_fn, image_perturb_fn=image_perturb_fn, download=download, *args, **kwargs)
26
+ elif dataset_name == "COCO_Retrieval":
27
+ from .retrieval import get_coco_retrieval
28
+ return get_coco_retrieval(image_preprocess=image_preprocess, text_perturb_fn=text_perturb_fn, image_perturb_fn=image_perturb_fn, download=download, *args, **kwargs)
29
+ elif dataset_name == "Flickr30k_Retrieval":
30
+ from .retrieval import get_flickr30k_retrieval
31
+ return get_flickr30k_retrieval(image_preprocess=image_preprocess, text_perturb_fn=text_perturb_fn, image_perturb_fn=image_perturb_fn, download=download, *args, **kwargs)
32
+ else:
33
+ raise ValueError(f"Unknown dataset {dataset_name}")
multimodal/build/lib/open_flamingo/eval/dataset_zoo/aro_datasets.py ADDED
@@ -0,0 +1,365 @@
1
+ import os
2
+ import json
3
+ import subprocess
4
+
5
+ import numpy as np
6
+
7
+ from PIL import Image
8
+ from tqdm import tqdm
9
+ from torch.utils.data import Dataset
10
+ from easydict import EasyDict as edict
11
+ from torchvision.datasets.utils import download_url
12
+
13
+ from .perturbations import TextShuffler
14
+ from .constants import ARO_ROOT, COCO_ROOT, FLICKR_ROOT
15
+ from .retrieval import pre_caption
16
+
17
+
18
+ class VG_Relation(Dataset):
19
+ def __init__(self, image_preprocess, text_perturb_fn=None, image_perturb_fn=None, root_dir=ARO_ROOT, download=False):
20
+ '''
21
+ image_preprocess: a function that takes in a PIL image and returns a tensor.
22
+ text_perturb_fn: Not used for this dataset. Just for compatibility with other datasets.
23
+ image_perturb_fn: Not used for this dataset. Just for compatibility with other datasets.
24
+ root_dir: Directory for the VG-R dataset.
25
+ download: Whether to download the dataset if it does not exist.
26
+ '''
27
+ self.root_dir = root_dir
28
+ annotation_file = os.path.join(root_dir, "visual_genome_relation.json")
29
+ image_dir = os.path.join(root_dir, "images")
30
+ if not os.path.exists(image_dir):
31
+ print("Image Directory for VG_Relation could not be found!")
32
+ if download:
33
+ self.download()
34
+ else:
35
+ raise RuntimeError("Please either download the dataset by letting `--download` or specify the correct directory.")
36
+
37
+ if not os.path.exists(annotation_file):
38
+ subprocess.call(["gdown", "--id", "1kX2iCHEv0CADL8dSO1nMdW-V0NqIAiP3", "--output", annotation_file])
39
+
40
+ with open(annotation_file, "r") as f:
41
+ self.dataset = json.load(f)
42
+
43
+ self.all_relations = list()
44
+ for item in self.dataset:
45
+ item["image_path"] = os.path.join(image_dir, item["image_path"])
46
+ self.all_relations.append(item["relation_name"])
47
+
48
+ self.image_preprocess = image_preprocess
49
+
50
+ def __len__(self):
51
+ return len(self.dataset)
52
+
53
+ def __getitem__(self, index):
54
+ test_case = self.dataset[index]
55
+ image = Image.open(test_case["image_path"]).convert('RGB')
56
+ # Get the bounding box that contains the relation. This is to remove the irrelevant details in the scene.
57
+ image = image.crop((test_case["bbox_x"], test_case["bbox_y"], test_case["bbox_x"] + test_case["bbox_w"], test_case["bbox_y"] + test_case["bbox_h"]))
58
+
59
+ if self.image_preprocess is not None:
60
+ image = self.image_preprocess(image)
61
+
62
+ # Each test case has a correct and incorrect caption.
63
+ true_caption = test_case["true_caption"]
64
+ false_caption = test_case["false_caption"]
65
+ item = edict({"image_options": [image], "caption_options": [false_caption, true_caption]})
66
+ return item
67
+
68
+ def download(self):
69
+ os.makedirs(self.root_dir, exist_ok=True)
70
+ image_zip_file = os.path.join(self.root_dir, "vgr_vga_images.zip")
71
+ subprocess.call(["gdown", "--no-cookies", "1qaPlrwhGNMrR3a11iopZUT_GPP_LrgP9", "--output", image_zip_file])
72
+ subprocess.call(["unzip", "vgr_vga_images.zip"], cwd=self.root_dir)
73
+
74
+
75
+ def evaluate_scores(self, scores):
76
+ """
77
+ Scores: N x 1 x 2, i.e. first caption is the perturbed one, second is the positive one
78
+ """
79
+ if isinstance(scores, tuple):
80
+ scores_i2t = scores[1]
81
+ scores_t2i = scores[0]
82
+ else:
83
+ scores_t2i = scores
84
+ scores_i2t = scores
85
+
86
+ metrics = {"Accuracy": None}
87
+ preds = np.argmax(np.squeeze(scores_i2t, axis=1), axis=-1)
88
+ correct_mask = (preds == 1)
89
+ metrics["Accuracy"] = np.mean(correct_mask)
90
+
91
+ all_relations = np.array(self.all_relations)
92
+
93
+ result_records = []
94
+ # Log the accuracy of all relations
95
+ for relation in np.unique(all_relations):
96
+ relation_mask = (all_relations == relation)
97
+ if relation_mask.sum() == 0:
98
+ continue
99
+ result_records.append({
100
+ "Relation": relation,
101
+ "Accuracy": correct_mask[relation_mask].mean(),
102
+ "Count": relation_mask.sum(),
103
+ "Dataset": "Visual Genome Relation"
104
+ })
105
+ return result_records
106
+
107
+
108
+
109
+ class VG_Attribution(Dataset):
110
+ def __init__(self, image_preprocess, text_perturb_fn=None, image_perturb_fn=None, root_dir=ARO_ROOT, download=False):
111
+ '''
112
+ image_preprocess: a function that takes in a PIL image and returns a tensor.
113
+ text_perturb_fn: Not used for this dataset. Just for compatibility with other datasets.
114
+ image_perturb_fn: Not used for this dataset. Just for compatibility with other datasets.
115
+ root_dir: Directory for the VG-A dataset.
116
+ '''
117
+ self.root_dir = root_dir
118
+ annotation_file = os.path.join(root_dir, "visual_genome_attribution.json")
119
+ image_dir = os.path.join(root_dir, "images")
120
+ if not os.path.exists(image_dir):
121
+ print("Image Directory for VG_Attribution could not be found!")
122
+ if download:
123
+ self.download()
124
+ else:
125
+ raise RuntimeError("Please either download the dataset by letting `--download` or specify the correct directory.")
126
+
127
+
128
+ if not os.path.exists(annotation_file):
129
+ subprocess.call(["gdown", "--id", "13tWvOrNOLHxl3Rm9cR3geAdHx2qR3-Tw", "--output", annotation_file])
130
+
131
+ with open(annotation_file, "r") as f:
132
+ self.dataset = json.load(f)
133
+
134
+ for item in self.dataset:
135
+ item["image_path"] = os.path.join(image_dir, item["image_path"])
136
+
137
+ # Set of attributes in each test case
138
+ self.all_attributes = [f"{item['attributes'][0]}_{item['attributes'][1]}" for item in self.dataset]
139
+ self.image_preprocess = image_preprocess
140
+
141
+ def __len__(self):
142
+ return len(self.dataset)
143
+
144
+ def __getitem__(self, index):
145
+ test_case = self.dataset[index]
146
+ image = Image.open(test_case["image_path"]).convert('RGB')
147
+ # Get the bounding box that contains the relation. This is to remove the irrelevant details in the scene.
148
+ image = image.crop((test_case["bbox_x"], test_case["bbox_y"], test_case["bbox_x"] + test_case["bbox_w"], test_case["bbox_y"] + test_case["bbox_h"]))
149
+
150
+ if self.image_preprocess is not None:
151
+ image = self.image_preprocess(image)
152
+
153
+ # Each test case has a correct and incorrect caption.
154
+ true_caption = test_case["true_caption"]
155
+ false_caption = test_case["false_caption"]
156
+ item = edict({"image_options": [image], "caption_options": [false_caption, true_caption]})
157
+ return item
158
+
159
+ def download(self):
160
+ os.makedirs(self.root_dir, exist_ok=True)
161
+ image_zip_file = os.path.join(self.root_dir, "vgr_vga_images.zip")
162
+ subprocess.call(["gdown", "--no-cookies", "1qaPlrwhGNMrR3a11iopZUT_GPP_LrgP9", "--output", image_zip_file])
163
+ subprocess.call(["unzip", "vgr_vga_images.zip"], cwd=self.root_dir)
164
+
165
+
166
+ def evaluate_scores(self, scores):
167
+ """
168
+ Scores: N x 1 x 2, i.e. first caption is the perturbed one, second is the positive one
169
+ """
170
+ if isinstance(scores, tuple):
171
+ scores_i2t = scores[1]
172
+ scores_t2i = scores[0]
173
+ else:
174
+ scores_t2i = scores
175
+ scores_i2t = scores
176
+
177
+ preds = np.argmax(np.squeeze(scores_i2t, axis=1), axis=-1)
178
+ correct_mask = (preds == 1)
179
+ result_records = []
180
+ all_attributes = np.array(self.all_attributes)
181
+ for attr in np.unique(all_attributes):
182
+ attr_mask = (all_attributes == attr)
183
+ if attr_mask.sum() < 25:
184
+ continue
185
+ result_records.append({
186
+ "Attributes": attr,
187
+ "Accuracy": correct_mask[attr_mask].mean(),
188
+ "Count": attr_mask.sum(),
189
+ "Dataset": "Visual Genome Attribution"
190
+ })
191
+ return result_records
192
+
193
+
194
+
195
+
196
+ class COCO_Order(Dataset):
197
+ def __init__(self, image_preprocess=None, root_dir=COCO_ROOT, max_words=30, split="test",
198
+ image_perturb_fn=None, download=False):
199
+ """
200
+ COCO Order Dataset.
201
+ image_preprocess: image preprocessing function
202
+ root_dir: The directory of the coco dataset. This directory should contain test2014 files.
203
+ max_words: Cropping the caption to max_words.
204
+ split: 'val' or 'test'
205
+ image_perturb_fn: not used; for compatibility.
206
+ download: Whether to download the dataset if it does not exist.
207
+ """
208
+ shuffler = TextShuffler()
209
+ perturb_functions = [shuffler.shuffle_nouns_and_adj, shuffler.shuffle_allbut_nouns_and_adj,
210
+ shuffler.shuffle_within_trigrams, shuffler.shuffle_trigrams]
211
+
212
+ self.root_dir = root_dir
213
+ if not os.path.exists(root_dir):
214
+ print("Directory for COCO could not be found!")
215
+ if download:
216
+ print("Downloading COCO now.")
217
+ self.download()
218
+ else:
219
+ raise RuntimeError("Please either download the dataset by letting `--download` or specify the correct directory.")
220
+
221
+ urls = {'val':'https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json',
222
+ 'test':'https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json'}
223
+ filenames = {'val':'coco_karpathy_val.json','test':'coco_karpathy_test.json'}
224
+ download_url(urls[split],root_dir)
225
+
226
+ self.annotation = json.load(open(os.path.join(root_dir,filenames[split]),'r'))
227
+ self.image_preprocess = image_preprocess
228
+ self.image_root = root_dir
229
+
230
+ self.test_cases = []
231
+
232
+ for img_id, ann in tqdm(enumerate(self.annotation)):
233
+ for i, caption in enumerate(ann['caption']):
234
+ test_case = {}
235
+ test_case["image"] = ann["image"]
236
+ test_case["caption_options"] = [pre_caption(caption,max_words)]
237
+
238
+ for perturb_fn in perturb_functions:
239
+ test_case["caption_options"].append(pre_caption(perturb_fn(caption), max_words))
240
+ self.test_cases.append(test_case)
241
+
242
+ def __len__(self):
243
+ return len(self.test_cases)
244
+
245
+ def __getitem__(self, index):
246
+ test_case = self.test_cases[index]
247
+ image_path = os.path.join(self.image_root, test_case["image"])
248
+
249
+ image = Image.open(image_path).convert('RGB')
250
+ if self.image_preprocess is not None:
251
+ image = self.image_preprocess(image)
252
+
253
+ item = edict({"image_options": [image], "caption_options": test_case["caption_options"]})
254
+ return item
255
+
256
+ def download(self):
257
+ import subprocess
258
+ os.makedirs(self.root_dir, exist_ok=True)
259
+ #subprocess.call(["wget", "http://images.cocodataset.org/zips/train2014.zip"], cwd=self.root_dir)
260
+ #subprocess.call(["unzip", "train2014.zip"], cwd=self.root_dir)
261
+
262
+ subprocess.call(["wget", "http://images.cocodataset.org/zips/val2014.zip"], cwd=self.root_dir)
263
+ subprocess.call(["unzip", "val2014.zip"], cwd=self.root_dir)
264
+
265
+ subprocess.call(["wget", "http://images.cocodataset.org/zips/test2014.zip"], cwd=self.root_dir)
266
+ subprocess.call(["unzip", "test2014.zip"], cwd=self.root_dir)
267
+
268
+
269
+ def evaluate_scores(self, scores):
270
+ if isinstance(scores, tuple):
271
+ scores_i2t = scores[0]
272
+ scores_t2i = scores[1].T # Make it N_ims x N_text
273
+
274
+ else:
275
+ scores_t2i = scores
276
+ scores_i2t = scores
277
+
278
+ preds = np.argmax(np.squeeze(scores_i2t, axis=1), axis=-1)
279
+ correct_mask = (preds == 0)
280
+ records = [{"Precision@1": np.mean(correct_mask)}]
281
+ return records
282
+
283
+
284
+ class Flickr30k_Order(Dataset):
285
+ def __init__(self, image_preprocess, split, root_dir=FLICKR_ROOT, max_words=30,
286
+ *args, **kwargs):
287
+ """
288
+ image_preprocess: image preprocessing function
289
+ split: 'val' or 'test'
290
+ root_dir: The directory of the flickr30k images. This should contain the `flickr30k-images` directory that \
291
+ contains all the images.
292
+ """
293
+ urls = {'val':'https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_val.json',
294
+ 'test':'https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json'}
295
+ filenames = {'val':'flickr30k_val.json','test':'flickr30k_test.json'}
296
+ if not os.path.exists(root_dir):
297
+ print("Directory for Flickr30k could not be found!")
298
+ flickr_url = "https://forms.illinois.edu/sec/229675"
299
+ raise RuntimeError(f"You need to manually sign up and download the dataset from {flickr_url} and place it in the `root_dir`.")
300
+
301
+ download_url(urls[split],root_dir)
302
+
303
+ self.annotation = json.load(open(os.path.join(root_dir,filenames[split]),'r'))
304
+ self.image_preprocess = image_preprocess
305
+ self.root_dir = root_dir
306
+
307
+ self.test_cases = []
308
+
309
+ shuffler = TextShuffler()
310
+ perturb_functions = [shuffler.shuffle_nouns_and_adj, shuffler.shuffle_allbut_nouns_and_adj,
311
+ shuffler.shuffle_within_trigrams, shuffler.shuffle_trigrams]
312
+ for img_id, ann in tqdm(enumerate(self.annotation)):
313
+ for i, caption in enumerate(ann['caption']):
314
+ test_case = {}
315
+ test_case["image"] = ann["image"]
316
+ test_case["caption_options"] = [pre_caption(caption,max_words)]
317
+
318
+ for perturb_fn in perturb_functions:
319
+ test_case["caption_options"].append(pre_caption(perturb_fn(caption), max_words))
320
+ self.test_cases.append(test_case)
321
+
322
+ def __len__(self):
323
+ return len(self.test_cases)
324
+
325
+ def __getitem__(self, index):
326
+ test_case = self.test_cases[index]
327
+ image_path = os.path.join(self.root_dir, test_case["image"])
328
+ image = Image.open(image_path).convert('RGB')
329
+
330
+ if self.image_preprocess is not None:
331
+ image = self.image_preprocess(image)
332
+
333
+ item = edict({"image_options": [image], "caption_options": test_case["caption_options"]})
334
+ return item
335
+
336
+ def evaluate_scores(self, scores):
337
+ if isinstance(scores, tuple):
338
+ scores_i2t = scores[0]
339
+ scores_t2i = scores[1].T # Make it N_ims x N_text
340
+ else:
341
+ scores_t2i = scores
342
+ scores_i2t = scores
343
+
344
+ preds = np.argmax(np.squeeze(scores_i2t, axis=1), axis=-1)
345
+ correct_mask = (preds == 0)
346
+ result_records = [{"Precision@1": np.mean(correct_mask)}]
347
+ return result_records
348
+
349
+
350
+ def get_visual_genome_relation(image_preprocess, text_perturb_fn=None, image_perturb_fn=None, download=False):
351
+ return VG_Relation(image_preprocess=image_preprocess, text_perturb_fn=text_perturb_fn, image_perturb_fn=image_perturb_fn, download=download)
352
+
353
+
354
+ def get_visual_genome_attribution(image_preprocess, text_perturb_fn=None, image_perturb_fn=None, download=False):
355
+ return VG_Attribution(image_preprocess=image_preprocess, text_perturb_fn=text_perturb_fn,
356
+ image_perturb_fn=image_perturb_fn, download=download)
357
+
358
+ def get_coco_order(image_preprocess, image_perturb_fn, text_perturb_fn, max_words=30, download=False, root_dir=COCO_ROOT, split="test"):
359
+ return COCO_Order(root_dir=root_dir, split=split, image_preprocess=image_preprocess, image_perturb_fn=image_perturb_fn, max_words=max_words,
360
+ download=download)
361
+
362
+ def get_flickr30k_order(image_preprocess, image_perturb_fn, text_perturb_fn, max_words=30, download=False, root_dir=FLICKR_ROOT, split="test"):
363
+ return Flickr30k_Order(root_dir=root_dir, split=split, image_preprocess=image_preprocess, image_perturb_fn=image_perturb_fn, max_words=max_words,
364
+ download=download)
365
+
multimodal/build/lib/open_flamingo/eval/dataset_zoo/constants.py ADDED
@@ -0,0 +1,3 @@
1
+ ARO_ROOT = "~/.cache/prerelease_bow"
2
+ COCO_ROOT = "~/.cache/coco/2014"
3
+ FLICKR_ROOT = "~/.cache/flickr30k/images"
multimodal/build/lib/open_flamingo/eval/dataset_zoo/perturbations.py ADDED
@@ -0,0 +1,194 @@
1
+ import torch
2
+ import random
3
+ import numpy as np
4
+ from functools import partial
5
+ import torch.nn.functional as nnf
6
+ from torchvision import transforms as T
7
+
8
+ # A lot of the approaches here are inspired from the wonderful paper from O'Connor and Andreas 2021.
9
+ # https://github.com/lingo-mit/context-ablations
10
+
11
+ def get_text_perturb_fn(text_perturb_fn):
12
+ if text_perturb_fn == "shuffle_nouns_and_adj":
13
+ return shuffle_nouns_and_adj
14
+ elif text_perturb_fn == "shuffle_allbut_nouns_and_adj":
15
+ return shuffle_allbut_nouns_and_adj
16
+ elif text_perturb_fn == "shuffle_within_trigrams":
17
+ return shuffle_within_trigrams
18
+ elif text_perturb_fn == "shuffle_all_words":
19
+ return shuffle_all_words
20
+ elif text_perturb_fn == "shuffle_trigrams":
21
+ return shuffle_trigrams
22
+ elif text_perturb_fn is None:
23
+ return None
24
+ else:
25
+ print("Unknown text perturbation function: {}, returning None".format(text_perturb_fn))
26
+ return None
27
+
28
+
29
+ def get_image_perturb_fn(image_perturb_fn):
30
+ if image_perturb_fn == "shuffle_rows_4":
31
+ return partial(shuffle_rows, n_rows=4)
32
+ elif image_perturb_fn == "shuffle_patches_9":
33
+ return partial(shuffle_patches, n_ratio=3)
34
+ elif image_perturb_fn == "shuffle_cols_4":
35
+ return partial(shuffle_columns, n_cols=4)
36
+ elif image_perturb_fn is None:
37
+ return None
38
+ else:
39
+ print("Unknown image perturbation function: {}, returning None".format(image_perturb_fn))
40
+ return None
41
+
42
+
43
+
44
+ class TextShuffler:
45
+
46
+ def __init__(self):
47
+ import spacy
48
+ self.nlp = spacy.load("en_core_web_sm")
49
+
50
+ def shuffle_nouns_and_adj(self, ex):
51
+
52
+ doc = self.nlp(ex)
53
+ tokens = [token.text for token in doc]
54
+ text = np.array(tokens)
55
+ noun_idx = [i for i, token in enumerate(doc) if token.tag_ in ['NN', 'NNS', 'NNP', 'NNPS']]
56
+ ## Finding adjectives
57
+ adjective_idx = [i for i, token in enumerate(doc) if token.tag_ in ['JJ', 'JJR', 'JJS']]
58
+ ## Shuffle the nouns of the text
59
+ text[noun_idx] = np.random.permutation(text[noun_idx])
60
+ ## Shuffle the adjectives of the text
61
+ text[adjective_idx] = np.random.permutation(text[adjective_idx])
62
+
63
+ return " ".join(text)
64
+
65
+ def shuffle_all_words(self, ex):
66
+ return " ".join(np.random.permutation(ex.split(" ")))
67
+
68
+
69
+ def shuffle_allbut_nouns_and_adj(self, ex):
70
+ doc = self.nlp(ex)
71
+ tokens = [token.text for token in doc]
72
+ text = np.array(tokens)
73
+ noun_adj_idx = [i for i, token in enumerate(doc) if token.tag_ in ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS']]
74
+ ## Finding adjectives
75
+
76
+ else_idx = np.ones(text.shape[0])
77
+ else_idx[noun_adj_idx] = 0
78
+
79
+ else_idx = else_idx.astype(bool)
80
+ ## Shuffle everything that are nouns or adjectives
81
+ text[else_idx] = np.random.permutation(text[else_idx])
82
+ return " ".join(text)
83
+
84
+
85
+ def get_trigrams(self, sentence):
86
+ # Taken from https://github.com/lingo-mit/context-ablations/blob/478fb18a9f9680321f0d37dc999ea444e9287cc0/code/transformers/src/transformers/data/data_augmentation.py
87
+ trigrams = []
88
+ trigram = []
89
+ for i in range(len(sentence)):
90
+ trigram.append(sentence[i])
91
+ if i % 3 == 2:
92
+ trigrams.append(trigram[:])
93
+ trigram = []
94
+ if trigram:
95
+ trigrams.append(trigram)
96
+ return trigrams
97
+
98
+ def trigram_shuffle(self, sentence):
99
+ trigrams = self.get_trigrams(sentence)
100
+ for trigram in trigrams:
101
+ random.shuffle(trigram)
102
+ return " ".join([" ".join(trigram) for trigram in trigrams])
103
+
104
+
105
+ def shuffle_within_trigrams(self, ex):
106
+ import nltk
107
+ tokens = nltk.word_tokenize(ex)
108
+ shuffled_ex = self.trigram_shuffle(tokens)
109
+ return shuffled_ex
110
+
111
+
112
+ def shuffle_trigrams(self, ex):
113
+ import nltk
114
+ tokens = nltk.word_tokenize(ex)
115
+ trigrams = self.get_trigrams(tokens)
116
+ random.shuffle(trigrams)
117
+ shuffled_ex = " ".join([" ".join(trigram) for trigram in trigrams])
118
+ return shuffled_ex
119
+
120
+
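A minimal usage sketch for the text perturbations above (an editorial example, not part of the commit): it assumes spaCy's en_core_web_sm model and NLTK's punkt data are installed, and it calls the TextShuffler methods directly, since the implementations shown here live on the class while get_text_perturb_fn refers to bare function names.

shuffler = TextShuffler()                          # loads spaCy's en_core_web_sm
caption = "A small brown dog chases a red ball"
print(shuffler.shuffle_nouns_and_adj(caption))     # nouns and adjectives permuted among themselves
print(shuffler.shuffle_all_words(caption))         # fully shuffled bag of words
print(shuffler.shuffle_within_trigrams(caption))   # uses nltk.word_tokenize, so punkt data is needed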
121
+ def _handle_image_4shuffle(x):
122
+ return_image = False
123
+ if not isinstance(x, torch.Tensor):
124
+ # print(f"x is not a tensor: {type(x)}. Trying to handle but fix this or I'll annoy you with this log")
125
+ t = torch.tensor(np.array(x)).unsqueeze(dim=0).float()
126
+ t = t.permute(0, 3, 1, 2)
127
+ return_image = True
128
+ return t, return_image
129
+ if len(x.shape) != 4:
130
+ #print("You did not send a tensor of shape NxCxWxH. Unsqueezing not but fix this or I'll annoy you with this log")
131
+ return x.unsqueeze(dim=0), return_image
132
+ else:
133
+ # Already a batched N x C x H x W tensor; return as-is.
134
+ return x, return_image
135
+
136
+
137
+ def shuffle_rows(x, n_rows=7):
138
+ """
139
+ Shuffle the rows of the image tensor, splitting the height into n_rows horizontal strips.
140
+ Tensor is of shape N x C x H x W
141
+ """
142
+ x, return_image = _handle_image_4shuffle(x)
143
+ patch_size = x.shape[-2]//n_rows
144
+ u = nnf.unfold(x, kernel_size=(patch_size, x.shape[-1]), stride=patch_size, padding=0)
145
+ # permute the patches of each image in the batch
146
+ pu = torch.cat([b_[:, torch.randperm(b_.shape[-1])][None,...] for b_ in u], dim=0)
147
+ # fold the permuted patches back together
148
+ f = nnf.fold(pu, x.shape[-2:], kernel_size=(patch_size, x.shape[-1]), stride=patch_size, padding=0)
149
+
150
+ image = f.squeeze() # C W H
151
+ if return_image:
152
+ return T.ToPILImage()(image.type(torch.uint8))
153
+ else:
154
+ return image
155
+
156
+
157
+ def shuffle_columns(x, n_cols=7):
158
+ """
159
+ Shuffle the columns of the image tensor, splitting the width into n_cols vertical strips.
160
+ Tensor is of shape N x C x H x W
161
+ """
162
+ x, return_image = _handle_image_4shuffle(x)
163
+ patch_size = x.shape[-1]//n_cols
164
+ u = nnf.unfold(x, kernel_size=(x.shape[-2], patch_size), stride=patch_size, padding=0)
165
+ # permute the patches of each image in the batch
166
+ pu = torch.cat([b_[:, torch.randperm(b_.shape[-1])][None,...] for b_ in u], dim=0)
167
+ # fold the permuted patches back together
168
+ f = nnf.fold(pu, x.shape[-2:], kernel_size=(x.shape[-2], patch_size), stride=patch_size, padding=0)
169
+ image = f.squeeze() # C W H
170
+ if return_image:
171
+ return T.ToPILImage()(image.type(torch.uint8))
172
+ else:
173
+ return image
174
+
175
+
176
+
177
+ def shuffle_patches(x, n_ratio=4):
178
+ """
179
+ Shuffle non-overlapping patches of the image tensor on an n_ratio x n_ratio grid.
180
+ Tensor is of shape N x C x H x W
181
+ """
182
+ x, return_image = _handle_image_4shuffle(x)
183
+ patch_size_x = x.shape[-2]//n_ratio
184
+ patch_size_y = x.shape[-1]//n_ratio
185
+ u = nnf.unfold(x, kernel_size=(patch_size_x, patch_size_y), stride=(patch_size_x, patch_size_y), padding=0)
186
+ # permute the patches of each image in the batch
187
+ pu = torch.cat([b_[:, torch.randperm(b_.shape[-1])][None,...] for b_ in u], dim=0)
188
+ # fold the permuted patches back together
189
+ f = nnf.fold(pu, x.shape[-2:], kernel_size=(patch_size_x, patch_size_y), stride=(patch_size_x, patch_size_y), padding=0)
190
+ image = f.squeeze() # C W H
191
+ if return_image:
192
+ return T.ToPILImage()(image.type(torch.uint8))
193
+ else:
194
+ return image
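A hedged sketch of how the image perturbations above are meant to be applied; the import path and example.jpg are assumptions, not taken from the commit.

from PIL import Image
# assumed import path for this module
from open_flamingo.eval.dataset_zoo.perturbations import get_image_perturb_fn

perturb = get_image_perturb_fn("shuffle_patches_9")   # partial(shuffle_patches, n_ratio=3)
img = Image.open("example.jpg").convert("RGB")        # placeholder image path
scrambled = perturb(img)                              # PIL in -> PIL out via _handle_image_4shuffle
scrambled.save("example_shuffled.jpg")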
multimodal/build/lib/open_flamingo/eval/dataset_zoo/retrieval.py ADDED
@@ -0,0 +1,266 @@
1
+ import os
2
+ import re
3
+ import json
4
+ import numpy as np
5
+
6
+ from PIL import Image
7
+ from tqdm import tqdm
8
+ from torch.utils.data import Dataset
9
+ from torchvision.datasets.utils import download_url
10
+
11
+ from .constants import COCO_ROOT, FLICKR_ROOT
12
+ from .utils import AverageMeter
13
+
14
+
15
+ def pre_caption(caption,max_words=50):
16
+ caption = re.sub(
17
+ r"([.!\"()*#:;~])",
18
+ ' ',
19
+ caption.lower(),
20
+ )
21
+ caption = re.sub(
22
+ r"\s{2,}",
23
+ ' ',
24
+ caption,
25
+ )
26
+ caption = caption.rstrip('\n')
27
+ caption = caption.strip(' ')
28
+
29
+ #truncate caption
30
+ caption_words = caption.split(' ')
31
+ if len(caption_words)>max_words:
32
+ caption = ' '.join(caption_words[:max_words])
33
+
34
+ return caption
35
+
36
+
37
+ class COCO_Retrieval(Dataset):
38
+ def __init__(self, image_preprocess=None, root_dir=COCO_ROOT, max_words=30, split="test",
39
+ image_perturb_fn=None, download=False):
40
+ """
41
+ COCO Retrieval Dataset.
42
+ image_preprocess: image preprocessing function
43
+ root_dir: The directory of the coco dataset. This directory should contain test2014 files.
44
+ max_words: Cropping the caption to max_words.
45
+ split: 'val' or 'test'
46
+ image_perturb_fn: image perturbation function for patch permutation experiments.
47
+ download: Whether to download the dataset if it does not exist.
48
+ """
49
+ self.root_dir = root_dir
50
+ if not os.path.exists(root_dir):
51
+ print("Directory for COCO could not be found!")
52
+ if download:
53
+ print("Downloading COCO now.")
54
+ self.download()
55
+ else:
56
+ raise RuntimeError("Please either download the dataset by letting `--download` or specify the correct directory.")
57
+
58
+ urls = {'val':'https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json',
59
+ 'test':'https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json'}
60
+ filenames = {'val':'coco_karpathy_val.json','test':'coco_karpathy_test.json'}
61
+ download_url(urls[split],root_dir)
62
+
63
+
64
+ self.annotation = json.load(open(os.path.join(root_dir,filenames[split]),'r'))
65
+ self.image_preprocess = image_preprocess
66
+ self.image_perturb_fn = image_perturb_fn
67
+ self.image_root = root_dir
68
+
69
+ self.text = []
70
+ self.image = []
71
+ self.txt2img = {}
72
+ self.img2txt = {}
73
+
74
+ txt_id = 0
75
+ for img_id, ann in enumerate(self.annotation):
76
+ self.image.append(ann['image'])
77
+ self.img2txt[img_id] = []
78
+ for i, caption in enumerate(ann['caption']):
79
+ self.text.append(pre_caption(caption,max_words))
80
+ self.img2txt[img_id].append(txt_id)
81
+ self.txt2img[txt_id] = img_id
82
+ txt_id += 1
83
+
84
+ def __len__(self):
85
+ return len(self.annotation)
86
+
87
+ def __getitem__(self, index):
88
+ image_path = os.path.join(self.image_root, self.annotation[index]['image'])
89
+ image = Image.open(image_path).convert('RGB')
90
+
91
+ if self.image_preprocess is not None:
92
+ image = self.image_preprocess(image)
93
+
94
+ if self.image_perturb_fn is not None:
95
+ image = self.image_perturb_fn(image)
96
+
97
+ return {"image": image, "idx": index}
98
+
99
+ def download(self):
100
+ import subprocess
101
+ os.makedirs(self.root_dir, exist_ok=True)
102
+ #subprocess.call(["wget", "http://images.cocodataset.org/zips/train2014.zip"], cwd=self.root_dir)
103
+ #subprocess.call(["unzip", "train2014.zip"], cwd=self.root_dir)
104
+
105
+ subprocess.call(["wget", "http://images.cocodataset.org/zips/val2014.zip"], cwd=self.root_dir)
106
+ subprocess.call(["unzip", "val2014.zip"], cwd=self.root_dir)
107
+
108
+ subprocess.call(["wget", "http://images.cocodataset.org/zips/test2014.zip"], cwd=self.root_dir)
109
+ subprocess.call(["unzip", "test2014.zip"], cwd=self.root_dir)
110
+
111
+
112
+ def evaluate_scores(self, scores):
113
+ if isinstance(scores, tuple):
114
+ scores_i2t = scores[0]
115
+ scores_t2i = scores[1].T # Make it N_ims x N_text
116
+
117
+ else:
118
+ scores_t2i = scores
119
+ scores_i2t = scores
120
+
121
+ print(f"COCO results across {scores_i2t.shape} samples. ")
122
+ prec_at_1 = AverageMeter()
123
+ prec_at_5 = AverageMeter()
124
+
125
+ # Text retrieval
126
+ tqdm_iterator = tqdm(range(len(self.img2txt)))
127
+ for i in tqdm_iterator:
128
+ top5_captions = np.argsort(scores_i2t[i])[-5:]
129
+ true_captions = self.img2txt[i]
130
+
131
+ prec_at_1.update(len(set(true_captions) & set(top5_captions[-1:]))>0)
132
+ prec_at_5.update(len(set(true_captions) & set(top5_captions))>0)
133
+
134
+ tqdm_iterator.set_description(f"Text Retrieval Prec@1: {prec_at_1.avg:.3f}, Prec@5: {prec_at_5.avg:.3f}")
135
+
136
+ # Image Retrieval
137
+ image_prec_at_1 = AverageMeter()
138
+ image_prec_at_5 = AverageMeter()
139
+
140
+ tqdm_iterator = tqdm(range(len(self.txt2img)))
141
+ for i in tqdm_iterator:
142
+ top5_images = np.argsort(scores_t2i[:, i])[-5:]
143
+ true_image = self.txt2img[i]
144
+
145
+ image_prec_at_1.update(true_image in top5_images[-1:])
146
+ image_prec_at_5.update(true_image in top5_images)
147
+
148
+ tqdm_iterator.set_description(f"Image Retrieval Prec@1: {image_prec_at_1.avg:.3f}, Prec@5: {image_prec_at_5.avg:.3f}")
149
+
150
+ records = [{"ImagePrec@1": image_prec_at_1.avg, "ImagePrec@5": image_prec_at_5.avg, "TextPrec@1": prec_at_1.avg, "TextPrec@5": prec_at_5.avg}]
151
+ return records
152
+
153
+
154
+
155
+ class Flickr30k_Retrieval(Dataset):
156
+ def __init__(self, image_preprocess, split, root_dir=FLICKR_ROOT, max_words=30,
157
+ image_perturb_fn=None, *args, **kwargs):
158
+ '''
159
+ Flickr30k dataset for retrieval.
160
+ image_preprocess: image preprocessing function
161
+ root_dir: The directory of the Flickr30k dataset; it should contain the images referenced by the annotation files.
162
+ max_words: Cropping the caption to max_words.
163
+ split: 'val' or 'test'
164
+ image_perturb_fn: image perturbation function for patch permutation experiments.
165
+ download: Not supported here; the Flickr30k images must be downloaded manually (see below).
166
+ '''
167
+ urls = {'val':'https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_val.json',
168
+ 'test':'https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json'}
169
+ filenames = {'val':'flickr30k_val.json','test':'flickr30k_test.json'}
170
+
171
+ if not os.path.exists(root_dir):
172
+ print("Directory for Flickr30k could not be found!")
173
+ flickr_url = "https://forms.illinois.edu/sec/229675"
174
+ raise RuntimeError(f"You need to manually sign up and download the dataset from {flickr_url} and place it in the `root_dir`.")
175
+
176
+ download_url(urls[split],root_dir)
177
+
178
+ self.annotation = json.load(open(os.path.join(root_dir,filenames[split]),'r'))
179
+ self.image_preprocess = image_preprocess
180
+ self.image_perturb_fn = image_perturb_fn
181
+ self.root_dir = root_dir
182
+
183
+ self.text = []
184
+ self.image = []
185
+ self.txt2img = {}
186
+ self.img2txt = {}
187
+
188
+ txt_id = 0
189
+ for img_id, ann in enumerate(self.annotation):
190
+ self.image.append(ann['image'])
191
+ self.img2txt[img_id] = []
192
+ for i, caption in enumerate(ann['caption']):
193
+ self.text.append(pre_caption(caption,max_words))
194
+ self.img2txt[img_id].append(txt_id)
195
+ self.txt2img[txt_id] = img_id
196
+ txt_id += 1
197
+
198
+ def __len__(self):
199
+ return len(self.annotation)
200
+
201
+ def __getitem__(self, index):
202
+ image_path = os.path.join(self.root_dir, self.annotation[index]['image'])
203
+ image = Image.open(image_path).convert('RGB')
204
+ if self.image_preprocess is not None:
205
+ image = self.image_preprocess(image)
206
+ if self.image_perturb_fn is not None:
207
+ image = self.image_perturb_fn(image)
208
+
209
+ return {"image": image, "idx": index}
210
+
211
+ def evaluate_scores(self, scores):
212
+ if isinstance(scores, tuple):
213
+ scores_i2t = scores[0]
214
+ scores_t2i = scores[1].T # Make it N_ims x N_text
215
+
216
+ else:
217
+ scores_t2i = scores
218
+ scores_i2t = scores
219
+
220
+ print(f"Flickr30k Retrieval results across {scores_i2t.shape} samples. ")
221
+ prec_at_1 = AverageMeter()
222
+ prec_at_5 = AverageMeter()
223
+
224
+ # Text retrieval
225
+ tqdm_iterator = tqdm(range(len(self.img2txt)))
226
+ for i in tqdm_iterator:
227
+ top5_captions = np.argsort(scores_i2t[i])[-5:]
228
+ true_captions = self.img2txt[i]
229
+
230
+ prec_at_1.update(len(set(true_captions) & set(top5_captions[-1:]))>0)
231
+ prec_at_5.update(len(set(true_captions) & set(top5_captions))>0)
232
+
233
+ tqdm_iterator.set_description(f"Text Retrieval Prec@1: {prec_at_1.avg:.3f}, Prec@5: {prec_at_5.avg:.3f}")
234
+
235
+ # Image Retrieval
236
+ image_prec_at_1 = AverageMeter()
237
+ image_prec_at_5 = AverageMeter()
238
+
239
+ tqdm_iterator = tqdm(range(len(self.txt2img)))
240
+ for i in tqdm_iterator:
241
+ top5_images = np.argsort(scores_t2i[:, i])[-5:]
242
+ true_image = self.txt2img[i]
243
+
244
+ image_prec_at_1.update(true_image in top5_images[-1:])
245
+ image_prec_at_5.update(true_image in top5_images)
246
+
247
+ tqdm_iterator.set_description(f"Image Retrieval Prec@1: {image_prec_at_1.avg:.3f}, Prec@5: {image_prec_at_5.avg:.3f}")
248
+
249
+ records = [{"ImagePrec@1": image_prec_at_1.avg, "ImagePrec@5": image_prec_at_5.avg, "TextPrec@1": prec_at_1.avg, "TextPrec@5": prec_at_5.avg}]
250
+ return records
251
+
252
+ def download(self):
253
+ raise NotImplementedError("Flickr30k dataset is not available for download.")
254
+
255
+
256
+
257
+ def get_coco_retrieval(image_preprocess, image_perturb_fn, text_perturb_fn, max_words=30, download=False, root_dir=COCO_ROOT, split="test"):
258
+ dataset = COCO_Retrieval(root_dir=root_dir, split=split, image_preprocess=image_preprocess, image_perturb_fn=image_perturb_fn, max_words=max_words,
259
+ download=download)
260
+ return dataset
261
+
262
+
263
+ def get_flickr30k_retrieval(image_preprocess, image_perturb_fn, text_perturb_fn, max_words=30, download=False, root_dir=FLICKR_ROOT, split="test"):
264
+ dataset = Flickr30k_Retrieval(root_dir=root_dir, split=split, image_preprocess=image_preprocess, image_perturb_fn=image_perturb_fn, max_words=max_words,
265
+ download=download)
266
+ return dataset
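A hedged sketch of the intended retrieval flow (the COCO root is a placeholder that must already contain the images, and the random score matrix only exercises the metric bookkeeping; it is not a meaningful result):

import numpy as np

dataset = get_coco_retrieval(image_preprocess=None, image_perturb_fn=None, text_perturb_fn=None,
                             root_dir="/path/to/coco", split="test")
# scores_i2t[i, j]: model similarity between image i and caption j
scores_i2t = np.random.rand(len(dataset), len(dataset.text))
records = dataset.evaluate_scores(scores_i2t)
print(records[0])   # {'ImagePrec@1': ..., 'ImagePrec@5': ..., 'TextPrec@1': ..., 'TextPrec@5': ...}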
multimodal/build/lib/open_flamingo/eval/dataset_zoo/utils.py ADDED
@@ -0,0 +1,15 @@
1
+ class AverageMeter(object):
2
+ def __init__(self):
3
+ self.reset()
4
+
5
+ def reset(self):
6
+ self.val = 0
7
+ self.avg = 0
8
+ self.sum = 0
9
+ self.count = 0
10
+
11
+ def update(self, val, n=1):
12
+ self.val = val
13
+ self.sum += val * n
14
+ self.count += n
15
+ self.avg = self.sum / self.count
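A small illustration of how AverageMeter accumulates the Prec@1/Prec@5 numbers in the retrieval evaluators above (the hits are made up):

meter = AverageMeter()
for hit in [True, False, True, True]:   # booleans count as 1/0
    meter.update(hit)
print(meter.avg)                        # 0.75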
multimodal/build/lib/open_flamingo/eval/eval_datasets.py ADDED
@@ -0,0 +1,101 @@
1
+ import json
2
+ import os
3
+
4
+ from PIL import Image
5
+ from torch.utils.data import Dataset
6
+ from torchvision.datasets import ImageFolder
7
+
8
+ from open_flamingo.eval.imagenet_utils import IMAGENET_1K_CLASS_ID_TO_LABEL
9
+
10
+
11
+ class COCOFlickrDataset(Dataset):
12
+ def __init__(
13
+ self,
14
+ image_dir_path,
15
+ annotations_path,
16
+ is_flickr=False,
17
+ ):
18
+ self.image_dir_path = image_dir_path
19
+ self.annotations = json.load(open(annotations_path))["annotations"]
20
+ self.is_flickr = is_flickr
21
+
22
+ def __len__(self):
23
+ return len(self.annotations)
24
+
25
+ def get_img_path(self, idx):
26
+ if self.is_flickr:
27
+ return f"{self.image_dir_path}/{self.annotations[idx]['image_id']}.jpg"
28
+ else:
29
+ return f"{self.image_dir_path}/{self.annotations[idx]['image_id']:012d}.jpg"
30
+
31
+ def __getitem__(self, idx):
32
+ image = Image.open(self.get_img_path(idx))
33
+ caption = self.annotations[idx]["caption"]
34
+ return {
35
+ "image": image,
36
+ "caption": caption,
37
+ "image_id": self.annotations[idx]["image_id"],
38
+ }
39
+
40
+
41
+ class VQADataset(Dataset):
42
+ def __init__(
43
+ self,
44
+ image_dir_path="/mmfs1/gscratch/efml/anasa2/data/vqav2/train2014/",
45
+ question_path="/mmfs1/gscratch/efml/anasa2/data/vqav2/v2_OpenEnded_mscoco_train2014_questions.json",
46
+ annotations_path="/mmfs1/gscratch/efml/anasa2/data/vqav2/v2_mscoco_train2014_annotations.json",
47
+ vqa_dataset="vqa",
48
+ ):
49
+ self.questions = json.load(open(question_path, "r"))["questions"]
50
+ self.answers = json.load(open(annotations_path, "r"))["annotations"]
51
+ self.image_dir_path = image_dir_path
52
+ self.vqa_dataset = vqa_dataset
53
+
54
+ def __len__(self):
55
+ return len(self.questions)
56
+
57
+ def get_img_path(self, question):
58
+ if self.vqa_dataset == "vqa":
59
+ return os.path.join(
60
+ self.image_dir_path, f"COCO_val2014_{question['image_id']:012d}.jpg"
61
+ )
62
+ elif self.vqa_dataset == "ok_vqa":
63
+ return os.path.join(
64
+ self.image_dir_path, f"COCO_val2014_{question['image_id']:012d}.jpg"
65
+ )
66
+ else:
67
+ raise Exception(f"Unknown VQA dataset {self.vqa_dataset}")
68
+
69
+ def __getitem__(self, idx):
70
+ question = self.questions[idx]
71
+ answers = self.answers[idx]
72
+ img_path = self.get_img_path(question)
73
+ image = Image.open(img_path)
74
+ return {
75
+ "image": image,
76
+ "question": question["question"],
77
+ "answers": [a["answer"] for a in answers["answers"]],
78
+ "question_id": question["question_id"],
79
+ }
80
+
81
+
82
+ class ImageNetDataset(ImageFolder):
83
+ """Class to represent the ImageNet1k dataset."""
84
+
85
+ def __init__(self, root, **kwargs):
86
+ super().__init__(root=root, **kwargs)
87
+
88
+ def __getitem__(self, idx):
89
+ sample, target = super().__getitem__(idx)
90
+ target_label = IMAGENET_1K_CLASS_ID_TO_LABEL[target]
91
+ return {
92
+ "image": sample,
93
+ "class_id": target, # numeric ID of the ImageNet class
94
+ "class_name": target_label, # human-readable name of ImageNet class
95
+ }
96
+
97
+
98
+ if __name__ == "__main__":
99
+ from open_flamingo.eval.task.gqa import GQADataset  # GQADataset is defined in the task module
+ gqa_dataset = GQADataset()
100
+ for sample in gqa_dataset:
101
+ print(sample)
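A quick, assumption-laden sketch of the captioning dataset above; the paths are placeholders, and note that get_img_path expects images named by their zero-padded ids:

ds = COCOFlickrDataset(
    image_dir_path="/path/to/coco/val2014",
    annotations_path="/path/to/coco/annotations/captions_val2014.json",
)
print(len(ds))
print(ds.get_img_path(0))   # /path/to/coco/val2014/000000xxxxxx.jpg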
multimodal/build/lib/open_flamingo/eval/evaluate.py ADDED
@@ -0,0 +1,1435 @@
1
+ import argparse
2
+ import json
3
+ from math import ceil
4
+ import logging
+ import os
5
+ import random
6
+ import uuid
7
+ from collections import defaultdict
8
+ from typing import Callable
9
+ import time
10
+ import cv2
11
+ import webdataset as wds
12
+ from sklearn.metrics import recall_score, average_precision_score
13
+
14
+ import more_itertools
15
+ import numpy as np
16
+ import torch
17
+ from coco_metric import compute_cider, postprocess_captioning_generation
18
+ from eval_datasets import VQADataset
19
+ from tqdm import tqdm
20
+ from collections import Counter
21
+
22
+ from vqa_metric import compute_vqa_accuracy, compute_gqa_accuracy
23
+ from open_flamingo.eval.classification import (
24
+ compute_per_sample_probs,
25
+ compute_per_sample_loss,
26
+ )
27
+ from open_flamingo.eval.imagenet_utils import (
28
+ openai_imagenet_classnames,
29
+ IMAGENET_1K_CLASS_ID_TO_LABEL,
30
+ )
31
+
32
+ from open_flamingo.src.factory import create_model_and_transforms
33
+ from PIL import Image
34
+ from io import BytesIO
35
+ import base64
36
+ from open_flamingo.train.distributed import init_distributed_device, world_info_from_env
37
+ import string
38
+ from open_flamingo.eval.task.reg import evaluate_reg
39
+ from open_flamingo.eval.task.gqa import GQADataset
40
+ from open_flamingo.eval.task.vl_checklist import evaluate_vlc
41
+ from open_flamingo.eval.task.crepe import evaluate_crepe
42
+ from open_flamingo.eval.task.caption import evaluate_coco_flickr
43
+ from open_flamingo.eval.task.utils import is_correct, get_iou
44
+ from open_flamingo.eval.task.cola import evaluate_cola
45
+ from open_flamingo.eval.task.gqa import evaluate_gqa
46
+
47
+ def expand2square(pil_img, background_color):
48
+ width, height = pil_img.size
49
+ if width == height:
50
+ return pil_img
51
+ elif width > height:
52
+ result = Image.new(pil_img.mode, (width, width), background_color)
53
+ result.paste(pil_img, (0, (width - height) // 2))
54
+ return result
55
+ else:
56
+ result = Image.new(pil_img.mode, (height, height), background_color)
57
+ result.paste(pil_img, ((height - width) // 2, 0))
58
+ return result
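+ # e.g. expand2square(img, (255, 255, 255)) above pastes the image centered on a white square
+ # canvas, so it can later be resized to a square model input without distorting the aspect ratio.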
59
+
60
+ parser = argparse.ArgumentParser()
61
+ parser.add_argument("--lm_path", type=str, default="facebook/opt-1.3b")
62
+ parser.add_argument("--lm_tokenizer_path", type=str, default="facebook/opt-30b")
63
+ parser.add_argument("--vision_encoder_path", default="ViT-L-14", type=str)
64
+ parser.add_argument("--vision_encoder_pretrained", default="openai", type=str)
65
+ parser.add_argument("--checkpoint_path", type=str, required=True)
66
+ parser.add_argument(
67
+ "--results_file", type=str, default=None, help="JSON file to save results"
68
+ )
69
+
70
+ # Trial arguments
71
+ parser.add_argument("--shots", nargs="+", default=[0, 4, 8, 16, 32], type=int)
72
+ parser.add_argument(
73
+ "--num_trials",
74
+ type=int,
75
+ default=1,
76
+ help="Number of trials to run for each shot using different demonstrations",
77
+ )
78
+ parser.add_argument(
79
+ "--trial_seeds",
80
+ nargs="+",
81
+ default=[0],
82
+ help="Seeds to use for each trial for picking demonstrations and eval sets",
83
+ )
84
+ parser.add_argument(
85
+ "--num_samples", type=int, default=5000, help="Number of samples to evaluate on"
86
+ )
87
+
88
+ parser.add_argument("--batch_size", type=int, default=8)
89
+
90
+ # Per-dataset evaluation flags
91
+ parser.add_argument(
92
+ "--eval_coco",
93
+ action="store_true",
94
+ default=False,
95
+ help="Whether to evaluate on COCO.",
96
+ )
97
+ parser.add_argument(
98
+ "--eval_vqav2",
99
+ action="store_true",
100
+ default=False,
101
+ help="Whether to evaluate on VQAV2.",
102
+ )
103
+ parser.add_argument(
104
+ "--eval_ok_vqa",
105
+ action="store_true",
106
+ default=False,
107
+ help="Whether to evaluate on OK-VQA.",
108
+ )
109
+ parser.add_argument(
110
+ "--eval_imagenet",
111
+ action="store_true",
112
+ default=False,
113
+ help="Whether to evaluate on ImageNet.",
114
+ )
115
+
116
+ parser.add_argument(
117
+ "--eval_flickr30",
118
+ action="store_true",
119
+ default=False,
120
+ help="Whether to evaluate on Flickr30.",
121
+ )
122
+
123
+ parser.add_argument(
124
+ "--eval_refcoco",
125
+ action="store_true",
126
+ default=False,
127
+ help="Whether to evaluate on RefCOCO.",
128
+ )
129
+
130
+ # Dataset arguments
131
+
132
+ ## Flickr30 Dataset
133
+ parser.add_argument(
134
+ "--flickr_image_dir_path",
135
+ type=str,
136
+ help="Path to the flickr30/flickr30k_images directory.",
137
+ default=None,
138
+ )
139
+ parser.add_argument(
140
+ "--flickr_annotations_json_path",
141
+ type=str,
142
+ help="Path to the dataset_flickr30k_coco_style.json file.",
143
+ default=None,
144
+ )
145
+
146
+ ## COCO Dataset
147
+ parser.add_argument(
148
+ "--coco_image_dir_path",
149
+ type=str,
150
+ help="Path to the flickr30/flickr30k_images directory.",
151
+ default=None,
152
+ )
153
+ parser.add_argument(
154
+ "--coco_annotations_json_path",
155
+ type=str,
156
+ default=None,
157
+ )
158
+
159
+ ## VQAV2 Dataset
160
+ parser.add_argument(
161
+ "--vqav2_image_dir_path",
162
+ type=str,
163
+ default=None,
164
+ )
165
+ parser.add_argument(
166
+ "--vqav2_questions_json_path",
167
+ type=str,
168
+ default=None,
169
+ )
170
+ parser.add_argument(
171
+ "--vqav2_annotations_json_path",
172
+ type=str,
173
+ default=None,
174
+ )
175
+
176
+ ## OK-VQA Dataset
177
+ parser.add_argument(
178
+ "--ok_vqa_image_dir_path",
179
+ type=str,
180
+ help="Path to the vqav2/train2014 directory.",
181
+ default=None,
182
+ )
183
+ parser.add_argument(
184
+ "--ok_vqa_questions_json_path",
185
+ type=str,
186
+ help="Path to the v2_OpenEnded_mscoco_train2014_questions.json file.",
187
+ default=None,
188
+ )
189
+ parser.add_argument(
190
+ "--ok_vqa_annotations_json_path",
191
+ type=str,
192
+ help="Path to the v2_mscoco_train2014_annotations.json file.",
193
+ default=None,
194
+ )
195
+
196
+ ## Imagenet dataset
197
+ parser.add_argument("--imagenet_root", type=str, default="/tmp")
198
+
199
+ ## RefCOCO dataset
200
+ parser.add_argument("--refcoco_tsvfile", type=str, default=None)
201
+
202
+ parser.add_argument(
203
+ "--location_token_num",
204
+ default=1000,
205
+ type=int,
206
+ )
207
+ # distributed training
208
+ parser.add_argument(
209
+ "--dist-url",
210
+ default="env://",
211
+ type=str,
212
+ help="url used to set up distributed training",
213
+ )
214
+ parser.add_argument(
215
+ "--dist-backend", default="nccl", type=str, help="distributed backend"
216
+ )
217
+ parser.add_argument(
218
+ "--horovod",
219
+ default=False,
220
+ action="store_true",
221
+ help="Use horovod for distributed training.",
222
+ )
223
+ parser.add_argument(
224
+ "--no-set-device-rank",
225
+ default=False,
226
+ action="store_true",
227
+ help="Don't set device index from local rank (when CUDA_VISIBLE_DEVICES restricted to one per proc).",
228
+ )
229
+ parser.add_argument(
230
+ "--dist",
231
+ default=False,
232
+ action="store_true",
233
+ )
234
+ parser.add_argument(
235
+ "--lora",
236
+ default=False,
237
+ action="store_true",
238
+ )
239
+ parser.add_argument(
240
+ "--lora_r",
241
+ default=16,
242
+ type=int,
243
+ required=False,
244
+ )
245
+ parser.add_argument(
246
+ "--legacy",
247
+ default=False,
248
+ action="store_true",
249
+ )
250
+ parser.add_argument(
251
+ "--special",
252
+ default=False,
253
+ action="store_true",
254
+ )
255
+ parser.add_argument(
256
+ "--id",
257
+ default=0,
258
+ type=int,
259
+ required=False,
260
+ )
261
+
262
+ parser.add_argument(
263
+ "--eval_gqa",
264
+ default=False,
265
+ action="store_true",
266
+ )
267
+ parser.add_argument(
268
+ "--use_sam",
269
+ default=None,
270
+ type=str,
271
+ required=False,
272
+ )
273
+ parser.add_argument(
274
+ "--add_visual_token",
275
+ default=False,
276
+ action="store_true",
277
+ )
278
+ parser.add_argument(
279
+ "--use_format_v2",
280
+ default=False,
281
+ action="store_true",
282
+ )
283
+ parser.add_argument(
284
+ "--eval_aro",
285
+ default=False,
286
+ action="store_true",
287
+ )
288
+ parser.add_argument(
289
+ "--eval_pisc",
290
+ default=False,
291
+ action="store_true",
292
+ )
293
+ parser.add_argument(
294
+ "--eval_reg",
295
+ default=False,
296
+ action="store_true",
297
+ )
298
+ parser.add_argument(
299
+ "--eval_vlc",
300
+ default=False,
301
+ action="store_true",
302
+ )
303
+ parser.add_argument(
304
+ "--eval_crepe",
305
+ default=False,
306
+ action="store_true",
307
+ )
308
+ parser.add_argument(
309
+ "--eval_cola",
310
+ default=False,
311
+ action="store_true",
312
+ )
313
+ parser.add_argument(
314
+ "--level",
315
+ default=4,
316
+ type=int,
317
+ )
318
+ parser.add_argument(
319
+ "--type",
320
+ default="swap",
321
+ type=str,
322
+ )
323
+ parser.add_argument(
324
+ "--choose_left_right",
325
+ default=False,
326
+ action="store_true",
327
+ )
328
+
329
+
330
+ class OKVQAPostProcess():
331
+ def __init__(self):
332
+ self._lemmatizer = None
333
+
334
+ def _lemmatize(self, answers):
335
+ def apply(answer):
336
+ doc = self.lemmatizer(answer)
337
+
338
+ words = []
339
+ for token in doc:
340
+ if token.pos_ in ["NOUN", "VERB"]:
341
+ words.append(token.lemma_)
342
+ else:
343
+ words.append(token.text)
344
+ answer = " ".join(words)
345
+
346
+ return answer
347
+
348
+ return [apply(answer) for answer in answers]
349
+
350
+ @property
351
+ def lemmatizer(self):
352
+ if self._lemmatizer is None:
353
+ try:
354
+ import spacy
355
+
356
+ self._lemmatizer = spacy.load("en_core_web_sm")
357
+ except ImportError:
358
+ logging.error(
359
+ """
360
+ Please install spacy and en_core_web_sm model to apply lemmatization.
361
+ python -m spacy download en_core_web_sm
362
+ OR
363
+ import spacy.cli
364
+ spacy.cli.download("en_core_web_sm")
365
+ """
366
+ )
367
+ exit(1)
368
+
369
+ return self._lemmatizer
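+ # Used in evaluate_vqa below to lemmatize OK-VQA predictions (nouns and verbs only),
+ # so that e.g. "running" can match the annotated answer "run".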
370
+
371
+
372
+ def main():
373
+ args = parser.parse_args()
374
+ if args.dist:
375
+ args.local_rank, args.rank, args.world_size = world_info_from_env()
376
+ print(f"local_rank: {args.local_rank} rank: {args.rank} world_size: {args.world_size}")
377
+ device_id = init_distributed_device(args)
378
+ else:
379
+ args.rank = 0
380
+ args.world_size = 1
381
+ print(f"rank: {args.rank} world_size: {args.world_size}")
382
+
383
+ if "sam" in args.checkpoint_path:
384
+ args.use_sam = "vit_l"
385
+
386
+ args.add_visual_token = True
387
+ if "lora" in args.checkpoint_path:
388
+ args.lora = True
389
+
390
+
391
+ args.add_pe = False
392
+ args.add_box = True
393
+ args.relation = False
394
+ args.enhance_data = False
395
+ args.use_format_v2 = True
396
+
397
+
398
+
399
+ import hashlib
400
+ args.id = hashlib.sha224(args.checkpoint_path.encode()).hexdigest()
401
+
402
+ # load model
403
+ flamingo, image_processor, tokenizer, vis_embed_size = create_model_and_transforms(
404
+ args.vision_encoder_path,
405
+ args.vision_encoder_pretrained,
406
+ args.lm_path,
407
+ args.lm_tokenizer_path,
408
+ location_token_num=args.location_token_num,
409
+ lora=args.lora,
410
+ lora_r=16,
411
+ use_sam=args.use_sam,
412
+ add_visual_token=args.add_visual_token,
413
+ use_format_v2=args.use_format_v2,
414
+ add_box=args.add_box,
415
+ add_pe=args.add_pe,
416
+ add_relation=args.relation,
417
+ enhance_data=args.enhance_data,
418
+ )
419
+ flamingo.use_format_v2 = args.use_format_v2
420
+ if args.special:
421
+ flamingo.special = True
422
+ else:
423
+ flamingo.special = False
424
+ if args.legacy:
425
+ flamingo.legacy = True
426
+ print("use legacy evaluation")
427
+ flamingo.step_num = int(args.checkpoint_path.split("/")[-1].split(".")[0].split("_")[-1])
428
+ flamingo.expr_name = args.checkpoint_path.split("/")[-2]
429
+ if args.rank == 0:
430
+ print("legacy", True if hasattr(flamingo, "legacy") else False)
431
+ print("step:", flamingo.step_num)
432
+ print("expr:", flamingo.expr_name)
433
+ print("use format v2:", flamingo.use_format_v2)
434
+ print(args)
435
+ checkpoint = torch.load(args.checkpoint_path, map_location="cpu")
436
+ model_state_dict = {}
437
+ for key in checkpoint["model_state_dict"].keys():
438
+ model_state_dict[key.replace("module.", "")] = checkpoint["model_state_dict"][key]
439
+ if "vision_encoder.logit_scale"in model_state_dict:
440
+ # previous checkpoint has some unnecessary weights
441
+ del model_state_dict["vision_encoder.logit_scale"]
442
+ del model_state_dict["vision_encoder.visual.proj"]
443
+ del model_state_dict["vision_encoder.visual.ln_post.weight"]
444
+ del model_state_dict["vision_encoder.visual.ln_post.bias"]
445
+ flamingo.load_state_dict(model_state_dict, strict=True)
446
+ results = defaultdict(list)
447
+ if args.eval_coco:
448
+ print("Evaluating on COCO...")
449
+ cider_score = evaluate_coco_flickr(
450
+ model=flamingo,
451
+ tokenizer=tokenizer,
452
+ image_processor=image_processor,
453
+ batch_size=args.batch_size,
454
+ vis_embed_size=vis_embed_size,
455
+ rank=args.rank,
456
+ world_size=args.world_size,
457
+ id=args.id,
458
+ )
459
+ results["coco"].append({"score": cider_score})
460
+
461
+ if args.eval_ok_vqa:
462
+ print("Evaluating on OK-VQA...")
463
+ for shot in args.shots:
464
+ scores = []
465
+ for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
466
+ ok_vqa_score = evaluate_vqa(
467
+ model=flamingo,
468
+ tokenizer=tokenizer,
469
+ image_processor=image_processor,
470
+ batch_size=args.batch_size,
471
+ image_dir_path=args.ok_vqa_image_dir_path,
472
+ questions_json_path=args.ok_vqa_questions_json_path,
473
+ annotations_json_path=args.ok_vqa_annotations_json_path,
474
+ vqa_dataset="ok_vqa",
475
+ vis_embed_size=vis_embed_size,
476
+ rank=args.rank,
477
+ world_size=args.world_size,
478
+ id=args.id,
479
+ )
480
+ results["ok_vqa"].append(
481
+ {"shots": shot, "score": ok_vqa_score}
482
+ )
483
+
484
+ if args.eval_vqav2:
485
+ print("Evaluating on VQAv2...")
486
+ for shot in args.shots:
487
+ scores = []
488
+ for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
489
+ vqa_score = evaluate_vqa(
490
+ model=flamingo,
491
+ tokenizer=tokenizer,
492
+ image_processor=image_processor,
493
+ batch_size=args.batch_size,
494
+ image_dir_path=args.vqav2_image_dir_path,
495
+ questions_json_path=args.vqav2_questions_json_path,
496
+ annotations_json_path=args.vqav2_annotations_json_path,
497
+ vqa_dataset="vqa",
498
+ vis_embed_size=vis_embed_size,
499
+ rank=args.rank,
500
+ world_size=args.world_size,
501
+ id=args.id,
502
+ )
503
+ results["vqav2"].append(
504
+ {"shots": shot, "score": vqa_score}
505
+ )
506
+
507
+ if args.eval_gqa:
508
+ print("Evaluating on GQA...")
509
+ gqa_score = evaluate_gqa(
510
+ model=flamingo,
511
+ tokenizer=tokenizer,
512
+ image_processor=image_processor,
513
+ batch_size=args.batch_size,
514
+ vis_embed_size=vis_embed_size,
515
+ rank=args.rank,
516
+ world_size=args.world_size,
517
+ id=args.id,
518
+ )
519
+ results["gqa"].append(
520
+ {"score": gqa_score}
521
+ )
522
+
523
+ if args.eval_refcoco:
524
+ print("Evaluating on RefCOCO...")
525
+ refcoco_score = evaluate_refcoco(
526
+ model=flamingo,
527
+ tokenizer=tokenizer,
528
+ image_processor=image_processor,
529
+ batch_size=args.batch_size,
530
+ device=args.device,
531
+ tsvfile=args.refcoco_tsvfile,
532
+ vis_embed_size=vis_embed_size,
533
+ rank=args.rank,
534
+ world_size=args.world_size,
535
+ id=args.id,
536
+ )
537
+ results["refcoco"].append(
538
+ {"score": refcoco_score}
539
+ )
540
+ if args.eval_aro:
541
+ print("Evaluating on ARO...")
542
+ aro_score = evaluate_aro(
543
+ model=flamingo,
544
+ tokenizer=tokenizer,
545
+ image_processor=image_processor,
546
+ vis_embed_size=vis_embed_size,
547
+ rank=args.rank,
548
+ world_size=args.world_size,
549
+ id=args.id,
550
+ choose_left_right=args.choose_left_right,
551
+ )
552
+ results["aro"].append(
553
+ {"score": aro_score}
554
+ )
555
+ if args.eval_pisc:
556
+ print("Evaluating on ARO...")
557
+ aro_score = evaluate_pisc(
558
+ model=flamingo,
559
+ tokenizer=tokenizer,
560
+ image_processor=image_processor,
561
+ batch_size=args.batch_size,
562
+ device=args.device,
563
+ tsvfile=args.refcoco_tsvfile,
564
+ vis_embed_size=vis_embed_size,
565
+ rank=args.rank,
566
+ world_size=args.world_size,
567
+ id=args.id,
568
+ )
569
+ results["pisc"].append(
570
+ {"score": aro_score}
571
+ )
572
+ if args.eval_reg:
573
+ print("Evaluating on Referring Expression Generation...")
574
+ cider = evaluate_reg(
575
+ model=flamingo,
576
+ tokenizer=tokenizer,
577
+ image_processor=image_processor,
578
+ vis_embed_size=vis_embed_size,
579
+ rank=args.rank,
580
+ world_size=args.world_size,
581
+ id=args.id,
582
+ )
583
+ results["reg"].append(
584
+ {"score": cider}
585
+ )
586
+ if args.eval_vlc:
587
+ print("Evaluating on VL-checklist...")
588
+ vlc_score = evaluate_vlc(
589
+ model=flamingo,
590
+ tokenizer=tokenizer,
591
+ image_processor=image_processor,
592
+ vis_embed_size=vis_embed_size,
593
+ rank=args.rank,
594
+ world_size=args.world_size,
595
+ id=args.id,
596
+ )
597
+ results["vlc"].append(
598
+ {"score": vlc_score}
599
+ )
600
+ if args.eval_crepe:
601
+ print("Evaluating on CREPE...")
602
+ crepe_score = evaluate_crepe(
603
+ model=flamingo,
604
+ tokenizer=tokenizer,
605
+ image_processor=image_processor,
606
+ vis_embed_size=vis_embed_size,
607
+ rank=args.rank,
608
+ world_size=args.world_size,
609
+ id=args.id,
610
+ level=args.level,
611
+ type=args.type,
612
+ )
613
+ results["crepe"].append(
614
+ {"score": crepe_score}
615
+ )
616
+ if args.eval_cola:
617
+ print("Evaluating on COLA...")
618
+ cola_score = evaluate_cola(
619
+ model=flamingo,
620
+ tokenizer=tokenizer,
621
+ image_processor=image_processor,
622
+ vis_embed_size=vis_embed_size,
623
+ rank=args.rank,
624
+ world_size=args.world_size,
625
+ id=args.id,
626
+ )
627
+ results["cola"].append(
628
+ {"score": cola_score}
629
+ )
630
+
631
+ def prepare_batch_images(batch, image_processor):
632
+ batch_images = None
633
+ for b in batch:
634
+ b_image = image_processor(b["image"]).unsqueeze(0).unsqueeze(1).unsqueeze(0)
635
+ if batch_images is None:
636
+ batch_images = b_image
637
+ else:
638
+ batch_images = torch.cat([batch_images, b_image], dim=0)
639
+ return batch_images
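+ # Shape note: each image becomes a (1, 1, 1, C, H, W) tensor and samples are concatenated
+ # along dim 0, i.e. one single-frame image per sample in the model's vision_x layout.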
640
+
641
+ def get_outputs(
642
+ model,
643
+ batch_images,
644
+ attention_mask,
645
+ max_generation_length,
646
+ min_generation_length,
647
+ num_beams,
648
+ length_penalty,
649
+ input_ids,
650
+ image_start_index_list=None,
651
+ image_nums=None,
652
+ bad_words_ids=None,
653
+ ):
654
+ with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):  # enter both contexts
655
+ outputs = model.generate(
656
+ batch_images,
657
+ input_ids,
658
+ attention_mask=attention_mask,
659
+ max_new_tokens=max_generation_length,
660
+ min_length=min_generation_length,
661
+ num_beams=num_beams,
662
+ length_penalty=length_penalty,
663
+ image_start_index_list=image_start_index_list,
664
+ image_nums=image_nums,
665
+ bad_words_ids=bad_words_ids,
666
+ )
667
+
668
+ outputs = outputs[:, len(input_ids[0]) :]
669
+ return outputs
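+ # Only the newly generated tokens are returned; the prompt (the first len(input_ids[0])
+ # tokens) is sliced off above.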
670
+
671
+
672
+ def evaluate_vqa(
673
+ model,
674
+ tokenizer,
675
+ image_processor,
676
+ batch_size,
677
+ image_dir_path=None,
678
+ questions_json_path=None,
679
+ annotations_json_path=None,
680
+ vqa_dataset="vqa",
681
+ vis_embed_size=None,
682
+ rank=0,
683
+ world_size=1,
684
+ id=0,
685
+ ):
686
+ """
687
+ Evaluate a model on VQA datasets. Currently supports VQAv2, OK-VQA, and GQA.
688
+
689
+ Args:
690
+ model (nn.Module): model to evaluate
691
+ tokenizer (transformers.PreTrainedTokenizer): tokenizer for the model
692
+ image_processor : image processor for the model
693
+ batch_size (int): batch size
694
+ image_dir_path (str): path to image directory
695
+ questions_json_path (str): path to questions json file
696
+ annotations_json_path (str): path to annotations json file
+ vis_embed_size (int): number of visual embedding tokens inserted per image
+ rank (int): rank of this process when running distributed evaluation
+ world_size (int): total number of evaluation processes
+ id: identifier used to name the per-rank temporary result files
+ vqa_dataset (string): type of vqa dataset; currently supports vqa, ok_vqa, gqa. Defaults to vqa.
707
+ Returns:
708
+ float: accuracy score
709
+ """
710
+ if world_size > 1:
711
+ torch.distributed.barrier()
712
+ if vqa_dataset == "gqa":
713
+ eval_dataset = GQADataset()
714
+ else:
715
+ eval_dataset = VQADataset(
716
+ image_dir_path=image_dir_path,
717
+ question_path=questions_json_path,
718
+ annotations_path=annotations_json_path,
719
+ vqa_dataset=vqa_dataset,
720
+ )
721
+ postprocessor = OKVQAPostProcess()
722
+ try:
723
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
724
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
725
+ pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
726
+ bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
727
+ except:
728
+ pass
729
+ def get_prompt(sample):
730
+ return f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>Question: {sample['question'].strip()} Short answer:"
731
+ # return f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>"
732
+
733
+ model.eval().cuda()
734
+ lang_encoder_name = model.lang_encoder.__class__.__name__.lower()
735
+ if "peft" in lang_encoder_name:
736
+ lang_encoder_name = model.lang_encoder.base_model.model.__class__.__name__.lower()
737
+ predictions = []
738
+ tokenizer.padding_side = "left"
739
+ if world_size > 1:
740
+ torch.distributed.barrier()
741
+ this_tot = 0
742
+ for ii, batch in enumerate(more_itertools.chunked(
743
+ tqdm(eval_dataset, desc="Running inference", disable=(rank != 0)), batch_size
744
+ )):
745
+ if ii % world_size != rank:
746
+ continue
747
+ batch_images = prepare_batch_images(
748
+ batch=batch,
749
+ image_processor=image_processor,
750
+ ).cuda()
751
+ batch_text = [get_prompt(s) for s in batch]
752
+ encodings = tokenizer(
753
+ batch_text,
754
+ return_tensors="pt",
755
+ padding="longest",
756
+ truncation=True,
757
+ max_length=2000,
758
+ )
759
+ input_ids = encodings["input_ids"].cuda()
760
+ attention_mask = encodings["attention_mask"].cuda()
761
+ skip_special_tokens = True
762
+ if hasattr(model, "legacy") and model.legacy and "opt" in lang_encoder_name:
763
+ if rank == 0:
764
+ tqdm.write("use legacy model")
765
+ for i in range(len(input_ids)):
766
+ media_token_index = (input_ids[i] == media_token_id).nonzero()[0,0]
767
+ endofmedia_token_index = (input_ids[i] == endofmedia_token_id).nonzero()[0,0]
768
+ input_ids[i, media_token_index - 1] = media_token_id
769
+ input_ids[i, media_token_index] = pad_token_id
770
+ input_ids[i, endofmedia_token_index - 1] = endofmedia_token_id
771
+ input_ids[i, endofmedia_token_index] = bos_token_id
772
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
773
+ image_start_index_list = [[x] for x in image_start_index_list]
774
+ image_nums = [1] * len(input_ids)
775
+ if "llama" in lang_encoder_name:
776
+ attention_mask[input_ids == 0] = 0
777
+ outputs = get_outputs(
778
+ model=model,
779
+ batch_images=batch_images,
780
+ attention_mask=attention_mask,
781
+ max_generation_length=10,
782
+ min_generation_length=1,
783
+ num_beams=5,
784
+ length_penalty=0,
785
+ input_ids=input_ids,
786
+ image_start_index_list=image_start_index_list,
787
+ image_nums=image_nums,
788
+ )
789
+ # postprocess begin
790
+ new_predictions = [
791
+ out.strip().lower().strip(string.punctuation+" ") for out in tokenizer.batch_decode(outputs, skip_special_tokens=skip_special_tokens)
792
+ ]
793
+ if vqa_dataset == "ok_vqa":
794
+ new_predictions = postprocessor._lemmatize(new_predictions)
795
+ if model.special:
796
+ for i in range(len(new_predictions)):
797
+ for answer, _ in Counter(batch[i]['answers']).most_common():
798
+ if answer in new_predictions[i]:
799
+ new_predictions[i] = answer
800
+ break
801
+ if "cant" in new_predictions[i] and "no" == answer:
802
+ new_predictions[i] = answer
803
+ break
804
+ if "can" in new_predictions[i] and "not" not in new_predictions[i] and "cant" not in new_predictions[i] and "yes" == answer:
805
+ new_predictions[i] = answer
806
+ break
807
+
808
+ this_tot += 1
809
+ if rank == 0 and this_tot % 20 == 0:
810
+ for i in range(1):
811
+ tqdm.write("model output: " + new_predictions[i])
812
+
813
+ predictions.extend(
814
+ [
815
+ {"answer": p, "question_id": sample["question_id"], "_question": sample["question"], "answers": sample["answers"]}
816
+ for p, sample in zip(new_predictions, batch)
817
+ ]
818
+ )
819
+ with open(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank}_{id}.json", "w") as f:
820
+ f.write(json.dumps(predictions))
821
+ print("save to", f"{vqa_dataset}_{lang_encoder_name}_results_part{rank}_{id}.json")
822
+
823
+ time.sleep(10)
824
+ if world_size > 1:
825
+ torch.distributed.barrier()
826
+ if rank == 0:
827
+ print(f"evaluate on rank {rank}. world size is {world_size}")
828
+ predictions = []
829
+ for rank_i in range(world_size):
830
+ print("load", f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")
831
+ predictions.extend(json.load(open(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")))
832
+ os.remove(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")
833
+ print("num:", len(predictions))
834
+ # save the predictions to a temporary file
835
+ random_uuid = str(uuid.uuid4())
836
+ with open(f"{vqa_dataset}results_{random_uuid}.json", "w") as f:
837
+ f.write(json.dumps(predictions, indent=4))
838
+
839
+ if vqa_dataset == "gqa":
840
+ acc = compute_gqa_accuracy(predictions)
841
+ else:
842
+ acc = compute_vqa_accuracy(
843
+ f"{vqa_dataset}results_{random_uuid}.json",
844
+ questions_json_path,
845
+ annotations_json_path,
846
+ vqa_dataset=vqa_dataset,
847
+ )
848
+ print(vqa_dataset, "score:", acc, "| save to", f"{vqa_dataset}results_{random_uuid}.json")
849
+ os.makedirs("eval_results", exist_ok=True)
850
+ with open(os.path.join("eval_results", f"{vqa_dataset}_{model.expr_name}_{model.step_num}_{int(time.time())}_{acc}"), "w") as f:
851
+ f.write(json.dumps(predictions, indent=2))
852
+
853
+ # delete the temporary file
854
+ os.remove(f"{vqa_dataset}results_{random_uuid}.json")
855
+ else:
856
+ time.sleep(5)
857
+ acc = 0.0
858
+ if world_size > 1:
859
+ torch.distributed.barrier()
860
+ return acc
861
+
862
+
863
+ def evaluate_refcoco(
864
+ model,
865
+ tokenizer,
866
+ image_processor,
867
+ batch_size,
868
+ tsvfile,
869
+ max_generation_length=20,
870
+ num_beams=3,
871
+ length_penalty=-2.0,
872
+ device=-1,
873
+ vis_embed_size=None,
874
+ rank=0,
875
+ world_size=1,
876
+ id=0,
877
+ ):
878
+ model.eval().cuda()
879
+ loc_token_ids = []
880
+ for i in range(1000):
881
+ loc_token_ids.append(int(tokenizer(f"<loc_{i}>", add_special_tokens=False)["input_ids"][-1]))
882
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
883
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
884
+ pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
885
+ bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
886
+ prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
887
+ object_token_id = tokenizer("<|#object#|>", add_special_tokens=False)["input_ids"][-1]
888
+ # all_ids = set(range(model.lang_encoder.lm_head.out_features))
889
+ # bad_words_ids = list(all_ids - set(loc_token_ids))
890
+ # bad_words_ids = [[b] for b in bad_words_ids]
891
+ # min_loc_token_id = min(loc_token_ids)
892
+ # max_loc_token_id = max(loc_token_ids)
893
+ total = 0
894
+ correct = 0
895
+ ious = []
896
+ if "refcocog" in tsvfile:
897
+ dataset_name = "refcocog"
898
+ elif "refcocoplus" in tsvfile:
899
+ dataset_name = "refcocoplus"
900
+ else:
901
+ dataset_name = "refcoco"
902
+ with open(tsvfile, "r") as f:
903
+ lines = f.readlines()
904
+ pbar = tqdm(lines, disable=(rank != 0))
905
+ for ii, line in enumerate(pbar):
906
+ if ii % world_size != rank:
907
+ continue
908
+ total += 1
909
+ line = line.rstrip()
910
+ uniq_id, image_id, text, region_coord, image = line.split("\t")
911
+
912
+ image = Image.open(BytesIO(base64.urlsafe_b64decode(image))).convert("RGB")
913
+ # image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal2/yolo.png").convert("RGB")
914
+ # image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal/temp/cat.png").convert("RGB")
915
+ # image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal/temp/262148000.png")
916
+
917
+ gt_box = np.array(list(map(float, region_coord.split(","))))
918
+ width = image.width
919
+ height = image.height
920
+ image = image.resize((224, 224))
921
+ gt_box = gt_box / np.array([width, height, width, height]) * 224
922
+ batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
923
+ text = text.rstrip('.').strip().replace('"', '').capitalize()
924
+ prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|><|#object#|>{text}<|#endofobject#|><|#visual#|>"]
925
+ # prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>the cat<|#visual#|>"]
926
+ # prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>"]
927
+ # prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>a man<|#visual#|> is doing a trick on a skateboard<|#visual#|>"]
928
+
929
+ encodings = tokenizer(
930
+ prompt,
931
+ padding="longest",
932
+ truncation=True,
933
+ return_tensors="pt",
934
+ max_length=2000,
935
+ )
936
+ input_ids = encodings["input_ids"]
937
+ attention_mask = encodings["attention_mask"]
938
+ # attention_mask[input_ids == prebox_token_id] = 0
939
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
940
+ image_start_index_list = [[x] for x in image_start_index_list]
941
+ image_nums = [1] * len(input_ids)
942
+ vision_x = batch_images.cuda()
943
+ lang_x = input_ids.cuda()
944
+ attention_mask = attention_mask.cuda()
945
+
946
+ model.debug_id = 0
947
+ with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):  # enter both contexts
948
+ outputs = model(
949
+ vision_x=vision_x,
950
+ lang_x=lang_x,
951
+ attention_mask=attention_mask,
952
+ labels=None,
953
+ image_nums=image_nums,
954
+ image_start_index_list=image_start_index_list,
955
+ added_bbox_list=None,
956
+ add_box=False,
957
+ )
958
+ boxes = outputs["boxes"]
959
+ scores = outputs["scores"]
960
+ boxes = boxes[scores >= scores[0]*0.5]
961
+ scores = scores[scores >= scores[0]*0.5]
962
+
963
+ text = text.lower().strip()
964
+ if text.split(" ")[0] not in ["a", "an", "the", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "several", "some"]:
965
+ text = "a " + text
966
+ losses = []
967
+ for box, score in zip(boxes, scores):
968
+ this_prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>There is<|#object#|><|#previsual#|><|#prebox#|><|#object#|> {text}"]
969
+ encodings = tokenizer(
970
+ this_prompt,
971
+ padding="longest",
972
+ truncation=True,
973
+ return_tensors="pt",
974
+ max_length=2000,
975
+ )
976
+ input_ids = encodings["input_ids"]
977
+ attention_mask = encodings["attention_mask"]
978
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
979
+ image_start_index_list = [[x] for x in image_start_index_list]
980
+ image_nums = [1] * len(input_ids)
981
+ vision_x = batch_images.cuda()
982
+ lang_x = input_ids.cuda()
983
+ attention_mask = attention_mask.cuda()
984
+ added_bbox_list = [torch.tensor(box / 224).cuda().unsqueeze(0).clamp(0, 0.99)]
985
+ labels = lang_x.clone()
986
+ start_idx = (lang_x == object_token_id).nonzero()[-1, -1]
987
+ labels[0, :start_idx+1] = -100
988
+ with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):  # enter both contexts
989
+ outputs = model(
990
+ vision_x=vision_x,
991
+ lang_x=lang_x,
992
+ attention_mask=attention_mask,
993
+ labels=labels,
994
+ image_nums=image_nums,
995
+ image_start_index_list=image_start_index_list,
996
+ added_bbox_list=added_bbox_list,
997
+ add_box=True,
998
+ )
999
+ # print(tokenizer.decode(outputs.logits[0, start_idx].sort(descending=True).indices[:10]))
1000
+ loss = outputs.loss.detach().cpu()
1001
+ losses.append((loss.sum() / (loss != 0).sum()).item())
1002
+ chosen_idx = np.array(losses).argmin()
1003
+ pred_box = boxes[chosen_idx]
1004
+ if chosen_idx != 0:
1005
+ tqdm.write(f"{text}|{chosen_idx}|{scores[chosen_idx]}")
1006
+ iou = get_iou(pred_box, gt_box)
1007
+ if iou >= 0.5:
1008
+ correct += 1
1009
+ # else:
1010
+ # if rank == 0:
1011
+ # tqdm.write(text.rstrip('.').strip().lower())
1012
+ # open_cv_image = np.array(image)
1013
+ # # Convert RGB to BGR
1014
+ # open_cv_image = open_cv_image[:, :, ::-1].copy()
1015
+ # open_cv_image = cv2.rectangle(open_cv_image, box[:2].astype(int), box[2:].astype(int), (255, 0, 0), 2)
1016
+ # open_cv_image = cv2.rectangle(open_cv_image, gt_box[:2].astype(int), gt_box[2:].astype(int), (0, 255, 0), 2)
1017
+ # cv2.imwrite(f"refcocog_result/{ii}_{iou}_{text}.jpg", open_cv_image)
1018
+ pbar.set_description(f"iou: {iou:.2f} score: {correct / total:.4f}")
1019
+ # open_cv_image = np.array(image)
1020
+ # # Convert RGB to BGR
1021
+ # open_cv_image = open_cv_image[:, :, ::-1].copy()
1022
+ # for box, score in zip(boxes, scores):
1023
+ # open_cv_image = cv2.rectangle(open_cv_image, box[:2].astype(int), box[2:].astype(int), (255, 0, 0), 2)
1024
+ # cv2.imwrite("output.jpg", open_cv_image)
1025
+ # print(boxes)
1026
+ # print(scores)
1027
+ # exit()
1028
+
1029
+
1030
+ with open(f"{dataset_name}_results_part{rank}_{id}.json", "w") as f:
1031
+ f.write(json.dumps([total, correct]))
1032
+ if world_size > 1:
1033
+ torch.distributed.barrier()
1034
+ if rank == 0:
1035
+ total = 0
1036
+ correct = 0
1037
+ print(f"evaluate on rank {rank}. world size is {world_size}")
1038
+ for rank_i in range(world_size):
1039
+ [total_part, correct_part] = json.load(open(f"{dataset_name}_results_part{rank_i}_{id}.json"))
1040
+ os.remove(f"{dataset_name}_results_part{rank_i}_{id}.json")
1041
+ total += total_part
1042
+ correct += correct_part
1043
+ score = correct / total
1044
+ print("score:", score)
1045
+ with open(os.path.join("eval_results", f"{dataset_name}_{model.expr_name}_{model.step_num}_{int(time.time())}_{score}"), "w") as f:
1046
+ pass
1047
+ else:
1048
+ score = 0.0
1049
+ if world_size > 1:
1050
+ torch.distributed.barrier()
1051
+ return score
1052
+
1053
+
1054
+
1055
+ # def preprocess_visual_info(Text):
1056
+ # text = Text.split(" ")
1057
+ # for is_idx, t in enumerate(text):
1058
+ # if t == "is":
1059
+ # break
1060
+ # the_idx = is_idx
1061
+ # while text[the_idx] != "the":
1062
+ # the_idx -= 1
1063
+ # obj_A = " ".join(text[the_idx+1:is_idx])
1064
+ # second_the_idx = len(text) - 1
1065
+ # while text[second_the_idx] != "the":
1066
+ # second_the_idx -= 1
1067
+ # obj_B = " ".join(text[second_the_idx+1:])
1068
+ # visual_obj_A = f"<|#object#|>{obj_A}<|#endofobject#|><|#visual#|><|#box#|><|#endofattr#|>"
1069
+ # visual_obj_B = f"<|#object#|>{obj_B}<|#endofobject#|><|#visual#|><|#box#|><|#endofattr#|>"
1070
+ # Text = Text.replace(obj_A, f"<|#object#|>{obj_A}<|#endofobject#|><|#visual#|><|#box#|><|#endofattr#|>")
1071
+ # Text = Text.replace(obj_B, f"<|#object#|>{obj_B}<|#endofobject#|><|#visual#|><|#box#|><|#endofattr#|>")
1072
+ # return Text, obj_A, obj_B, visual_obj_A, visual_obj_B
1073
+
1074
+
1075
+ def preprocess_visual_info(Text):
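+ # Heuristic parser for captions of the form "the <A> is <relation> the <B>": obj_A is the
+ # span between the "the" preceding the first "is" and that "is", obj_B follows the last
+ # "the", and the words in between form the relation; A is then wrapped in
+ # <|#visual#|><|#box#|> grounding tokens and B in <|#previsual#|><|#prebox#|> tokens.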
1076
+ text = Text.split(" ")
1077
+ for is_idx, t in enumerate(text):
1078
+ if t == "is":
1079
+ break
1080
+ the_idx = is_idx
1081
+ while text[the_idx] != "the":
1082
+ the_idx -= 1
1083
+ obj_A = " ".join(text[the_idx+1:is_idx])
1084
+ second_the_idx = len(text) - 1
1085
+ while text[second_the_idx] != "the":
1086
+ second_the_idx -= 1
1087
+ obj_B = " ".join(text[second_the_idx+1:])
1088
+ relation = " ".join(text[is_idx+1:second_the_idx])
1089
+ visual_obj_A = f"<|#object#|>the {obj_A}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|>"
1090
+ visual_obj_B = f"<|#object#|><|#previsual#|><|#prebox#|><|#object#|>the {obj_B}<|#endofobject#|>"
1091
+ Text = f"{visual_obj_A} is {relation} {visual_obj_B}"
1092
+ return Text, obj_A, visual_obj_A, obj_B, visual_obj_B, relation
1093
+
1094
+
1095
+
1096
+
1097
+ def get_bbox(visual_box_list, batch_images, prompt, model, tokenizer, media_token_id, prebox_token_id, debug=False, return_all=False):
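+ # Single forward pass in box-prediction mode: the prompt typically ends with a
+ # <|#visual#|> or <|#previsual#|> query, and the model returns candidate boxes with
+ # scores. With return_all=True all candidates are returned; otherwise only the
+ # highest-scoring box and its score.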
1098
+ assert isinstance(prompt, list) and len(prompt) == 1 and isinstance(prompt[0], str)
1099
+ encodings = tokenizer(
1100
+ prompt,
1101
+ padding="longest",
1102
+ truncation=True,
1103
+ return_tensors="pt",
1104
+ max_length=2000,
1105
+ )
1106
+ input_ids = encodings["input_ids"]
1107
+ attention_mask = encodings["attention_mask"]
1108
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
1109
+ image_start_index_list = [[x] for x in image_start_index_list]
1110
+ image_nums = [1] * len(input_ids)
1111
+ vision_x = batch_images.cuda()
1112
+ lang_x = input_ids.cuda()
1113
+ attention_mask = attention_mask.cuda()
1114
+
1115
+ model.debug_id = 0
1116
+ with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
1117
+ outputs = model(
1118
+ vision_x=vision_x,
1119
+ lang_x=lang_x,
1120
+ attention_mask=attention_mask,
1121
+ labels=None,
1122
+ image_nums=image_nums,
1123
+ image_start_index_list=image_start_index_list,
1124
+ added_bbox_list=visual_box_list,
1125
+ add_box=visual_box_list is not None,
1126
+ relations=None,
1127
+ debug_mode=False,
1128
+ )
1129
+ boxes = outputs["boxes"]
1130
+ scores = outputs["scores"]
1131
+ if debug:
1132
+ import pdb; pdb.set_trace()
1133
+ if return_all:
1134
+ return boxes, scores
1135
+ if len(scores) == 0:
1136
+ return None, None
1137
+ else:
1138
+ return boxes[scores.argmax()], scores.max()
1139
+
1140
+
1141
+ def evaluate_aro(
1142
+ model,
1143
+ tokenizer,
1144
+ image_processor,
1145
+ vis_embed_size=None,
1146
+ rank=0,
1147
+ world_size=1,
1148
+ id=0,
1149
+ add_visual=True,
1150
+ subset=False,
1151
+ choose_left_right=False,
1152
+ ):
1153
+ # os.makedirs(f"visualization/aro_results_{id}", exist_ok=True)
1154
+ dataset_name = "aro"
1155
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
1156
+ box_token_id = tokenizer("<|#box#|>", add_special_tokens=False)["input_ids"][-1]
1157
+ endofobject_token_id = tokenizer("<|#endofobject#|>", add_special_tokens=False)["input_ids"][-1]
1158
+ endofattr_token_id = tokenizer("<|#endofattr#|>", add_special_tokens=False)["input_ids"][-1]
1159
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
1160
+ visual_token_id = tokenizer("<|#visual#|>", add_special_tokens=False)["input_ids"][-1]
1161
+ previsual_token_id = tokenizer("<|#previsual#|>", add_special_tokens=False)["input_ids"][-1]
1162
+ prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
1163
+ model.eval().cuda()
1164
+ total = 0
1165
+ n_top1 = 0
1166
+ n_top5 = 0
1167
+ from open_flamingo.eval.dataset_zoo import VG_Relation, VG_Attribution
1168
+ vgr_dataset = VG_Relation(image_preprocess=None, download=True, root_dir="/gpfs/u/home/LMCG/LMCGljnn/scratch/code/vision-language-models-are-bows/data")
1169
+ if subset:
1170
+ subset_idx = json.load(open("aro_subset.json"))
1171
+ pbar = tqdm(subset_idx, disable=(rank != 0))
1172
+ else:
1173
+ pbar = tqdm(vgr_dataset, disable=(rank != 0))
1174
+ for ii, sample in enumerate(pbar):
1175
+ if subset:
1176
+ ORI_IDX = int(sample)
1177
+ sample = vgr_dataset[sample]
1178
+ if ii % world_size != rank:
1179
+ continue
1180
+ image = sample["image_options"][0]
1181
+ # image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal2/yolo.png").convert("RGB")
1182
+ image = image.resize((224, 224))
1183
+
1184
+ text = sample["caption_options"][1] # 1 is true caption
1185
+ # text = "the dog is sitting on the floor" if idx == 1 else "the floor is sitting on the dog"
1186
+ batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
1187
+ text, obj_A, visual_obj_A, obj_B, visual_obj_B, relation = preprocess_visual_info(text)
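+ # Two-stage grounding: first locate obj_A with a <|#visual#|> query, then condition on
+ # that box to propose candidate boxes for obj_B via a <|#previsual#|> query.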
1188
+
1189
+
1190
+ first_text = f"<|#object#|>the {obj_A}<|#endofobject#|><|#visual#|>"
1191
+ prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>{first_text}"]
1192
+ first_box, first_score = get_bbox(None, batch_images, prompt, model, tokenizer, media_token_id, prebox_token_id, return_all=False)
1193
+
1194
+ if first_box is None:
1195
+ text_A = "the " + obj_A
1196
+ added_bbox_list = None
1197
+ else:
1198
+ text_A = visual_obj_A
1199
+ added_bbox_list = [torch.tensor(first_box).unsqueeze(0).cuda() / 224]
1200
+
1201
+ prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>{text_A} is {relation}<|#object#|><|#previsual#|>"]
1202
+ pre_boxes, pre_scores = get_bbox(added_bbox_list, batch_images, prompt, model, tokenizer, media_token_id,
1203
+ prebox_token_id, return_all=True)
1204
+
1205
+ if pre_boxes is None:
1206
+ pre_boxes = [np.array([0.0, 0.0, 223.0, 223.0])]
1207
+ pre_scores = [1.0]
1208
+
1209
+ logits_list = []
1210
+ # pre_boxes = [pre_boxes[0]]
1211
+ # pre_scores = [pre_scores[0]]
1212
+ for pre_box, pre_score in zip(pre_boxes, pre_scores):
1213
+ prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>{text_A} is {relation}<|#object#|><|#previsual#|><|#prebox#|><|#object#|> the {obj_B}<|#endofobject#|>"]
1214
+
1215
+ encodings = tokenizer(
1216
+ prompt,
1217
+ padding="longest",
1218
+ truncation=True,
1219
+ return_tensors="pt",
1220
+ max_length=512,
1221
+ )
1222
+ input_ids = encodings["input_ids"]
1223
+ attention_mask = encodings["attention_mask"]
1224
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
1225
+ image_start_index_list = [[x] for x in image_start_index_list]
1226
+ image_nums = [1] * len(input_ids)
1227
+ vision_x = batch_images.cuda()
1228
+ lang_x = input_ids.cuda()
1229
+ attention_mask = attention_mask.cuda()
1230
+ labels = lang_x.clone()
1231
+ added_bbox_list = None
1232
+ if add_visual:
1233
+ added_bbox_list = []
1234
+ if first_box is not None:
1235
+ added_bbox_list.append(torch.tensor(first_box).unsqueeze(0).cuda().float() / 224)
1236
+ if pre_box is not None:
1237
+ added_bbox_list.append(torch.tensor(pre_box).unsqueeze(0).cuda().float() / 224)
1238
+ if added_bbox_list is not None and len(added_bbox_list) == 0:
1239
+ added_bbox_list = None
1240
+
1241
+ with torch.cuda.amp.autocast(dtype=torch.float16), torch.no_grad():
1242
+ outputs = model(
1243
+ vision_x=vision_x,
1244
+ lang_x=lang_x,
1245
+ attention_mask=attention_mask,
1246
+ labels=labels,
1247
+ image_nums=image_nums,
1248
+ image_start_index_list=image_start_index_list,
1249
+ added_bbox_list=added_bbox_list,
1250
+ add_box=added_bbox_list is not None,
1251
+ relations=None,
1252
+ )
1253
+ logits_list.append([pre_score, outputs.logits])
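+ # Marginalize over the candidate pre-boxes for obj_B: weight each candidate's next-token
+ # distribution by its box score, then check whether obj_B ranks in the top-1 / top-5.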
1254
+ pre_scores = np.array([x[0] for x in logits_list])
1255
+ final_probs = 0.0
1256
+ for score, (_, logits) in zip(pre_scores, logits_list):
1257
+ final_probs += score * logits.softmax(-1)
1258
+ assert input_ids.shape[:2] == final_probs.shape[:2]
1259
+ _rank, is_top1, is_top5 = is_correct(input_ids, final_probs, tokenizer, obj_B, topk=5)
1260
+ if is_top1:
1261
+ n_top1 += 1
1262
+ if is_top5:
1263
+ n_top5 += 1
1264
+ total += 1
1265
+ pbar.set_description(f"acc@top1: {n_top1 / total:.4f} | acc@top5: {n_top5 / total:.4f} | {_rank}")
1266
+
1267
+
1268
+ with open(f"{dataset_name}_results_part{rank}_{id}.json", "w") as f:
1269
+ f.write(json.dumps([total, n_top1, n_top5]))
1270
+ if world_size > 1:
1271
+ torch.distributed.barrier()
1272
+ if rank == 0:
1273
+ total = 0
1274
+ n_top1 = 0
1275
+ n_top5 = 0
1276
+ print(f"evaluate on rank {rank}. world size is {world_size}")
1277
+ for rank_i in range(world_size):
1278
+ [total_part, n_top1_part, n_top5_part] = json.load(open(f"{dataset_name}_results_part{rank_i}_{id}.json"))
1279
+ os.remove(f"{dataset_name}_results_part{rank_i}_{id}.json")
1280
+ total += total_part
1281
+ n_top1 += n_top1_part
1282
+ n_top5 += n_top5_part
1283
+ acc_top1 = n_top1 / total
1284
+ acc_top5 = n_top5 / total
1285
+ print("acc_top1:", acc_top1, "acc_top5:", acc_top5, "total:", total)
1286
+ with open(os.path.join("eval_results", f"{dataset_name}_{model.expr_name}_{model.step_num}_{int(time.time())}_{acc_top1}_{acc_top5}_{total}_{subset}"), "w") as f:
1287
+ pass
1288
+ else:
1289
+ score = 0.0
1290
+ if world_size > 1:
1291
+ torch.distributed.barrier()
1292
+ return score
1293
+
1294
+
1295
+ def evaluate_pisc(
1296
+ model,
1297
+ tokenizer,
1298
+ image_processor,
1299
+ batch_size,
1300
+ tsvfile,
1301
+ max_generation_length=20,
1302
+ num_beams=3,
1303
+ length_penalty=-2.0,
1304
+ device=-1,
1305
+ vis_embed_size=None,
1306
+ rank=0,
1307
+ world_size=1,
1308
+ id=0,
1309
+ add_visual=True,
1310
+ ):
1311
+ from open_flamingo.train.instruction_template import PISC_TEMPLATES
1312
+ dataset_name = "pisc"
1313
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
1314
+ box_token_id = tokenizer("<|#box#|>", add_special_tokens=False)["input_ids"][-1]
1315
+ endofobject_token_id = tokenizer("<|#endofobject#|>", add_special_tokens=False)["input_ids"][-1]
1316
+ endofattr_token_id = tokenizer("<|#endofattr#|>", add_special_tokens=False)["input_ids"][-1]
1317
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
1318
+ visual_token_id = tokenizer("<|#visual#|>", add_special_tokens=False)["input_ids"][-1]
1319
+ model.eval().cuda()
1320
+
1321
+ dataset = wds.WebDataset("/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/junyan/raw/instruct/eval/pisc/000000.tar").decode().to_tuple("image_path.txt", "dataset.txt", "data.pyd")
1322
+ pbar = tqdm(dataset, disable=(rank != 0))
1323
+
1324
+ rel_id_to_type = ["friends", "family", "couple", "professional", "commercial", "no relation"]
1325
+ rel_type_to_id = {x: i for i, x in enumerate(rel_id_to_type)}
1326
+ gt = []
1327
+ pred_scores = []
1328
+ for III, sample in enumerate(pbar):
1329
+ if III % world_size != rank:
1330
+ continue
1331
+ image_path, dataset, data = sample
1332
+ image = Image.open(image_path)
1333
+ size = image_processor.transforms[0].size
1334
+ image = image.resize((size, size))
1335
+ batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
1336
+ boxA = data[0]
1337
+ boxB = data[1]
1338
+ gt_relation = data[2]
1339
+ losses = []
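+ # Score every candidate relation by the language-model loss of its filled-in template;
+ # the relation with the lowest loss (highest likelihood) becomes the prediction.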
1340
+ for i_rel, option_rel in enumerate(rel_id_to_type):
1341
+ text = PISC_TEMPLATES[0].format(relation=option_rel)
1342
+ added_bbox = [
1343
+ torch.tensor([boxA]).cuda(),
1344
+ torch.tensor([boxB]).cuda(),
1345
+ ]
1346
+ caption = f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>{text}{tokenizer.eos_token}"
1347
+ encodings = tokenizer(
1348
+ caption,
1349
+ padding="longest",
1350
+ truncation=True,
1351
+ return_tensors="pt",
1352
+ max_length=2000,
1353
+ )
1354
+ input_ids = encodings["input_ids"]
1355
+ attention_mask = encodings["attention_mask"]
1356
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
1357
+ image_start_index_list = [[x] for x in image_start_index_list]
1358
+ image_nums = [1] * len(input_ids)
1359
+ vision_x = batch_images.cuda()
1360
+ lang_x = input_ids.cuda()
1361
+ attention_mask = attention_mask.cuda()
1362
+
1363
+ labels = lang_x.clone()
1364
+ labels[labels == tokenizer.pad_token_id] = -100
1365
+ if add_visual:
1366
+ # endofattr_next_token_index = list((labels == endofattr_token_id).nonzero(as_tuple=True))
1367
+ # endofattr_next_token_index[1] += 1
1368
+ # endofattr_next_token_id = labels[endofattr_next_token_index]
1369
+ # </obj><visual><box></attr>NEXT_WORD
1370
+ # </obj> predict NEXT_WORD
1371
+ # <visual><box></attr> predict nothing
1372
+ labels[labels == visual_token_id] = -100
1373
+ labels[labels == box_token_id] = -100
1374
+ labels[labels == endofattr_token_id] = -100
1375
+ # labels[endofattr_next_token_index] = -100
1376
+ labels[:, 0] = -100
1377
+ answer_token_id = tokenizer(" Answer").input_ids[0]
1378
+ answer_token_loc = (input_ids == answer_token_id).nonzero()
1379
+ for batch_idx, idx in answer_token_loc:
1380
+ labels[batch_idx][:idx+2] = -100
1381
+
1382
+ with torch.cuda.amp.autocast(dtype=torch.float16), torch.no_grad():
1383
+ outputs = model(
1384
+ vision_x=vision_x,
1385
+ lang_x=lang_x,
1386
+ attention_mask=attention_mask,
1387
+ labels=labels,
1388
+ image_nums=image_nums,
1389
+ image_start_index_list=image_start_index_list,
1390
+ added_bbox_list=added_bbox,
1391
+ add_box=added_bbox is not None,
1392
+ )
1393
+ loss_total = outputs.loss.reshape(labels.shape[0], -1)
1394
+ loss = loss_total.sum() / (loss_total != 0).sum()
1395
+ losses.append(loss.item())
1396
+ pred_scores.append(np.exp(-np.array(losses)) / np.exp(-np.array(losses)).sum())
1397
+ gt.append(rel_type_to_id[gt_relation])
1398
+ gt = np.array(gt)
1399
+ pred_scores = np.array(pred_scores)
1400
+ pred = pred_scores.argmax(1)
1401
+
1402
+
1403
+ print("total num:", len(gt))
1404
+ recalls = recall_score(y_true=gt, y_pred=pred, average=None, labels=[0,1,2,3,4,5])
1405
+ print("recalls:", recalls)
1406
+
1407
+ with open(f"{dataset_name}_results_part{rank}_{id}.json", "w") as f:
1408
+ f.write(json.dumps([gt.tolist(), pred.tolist()]))
1409
+ if world_size > 1:
1410
+ torch.distributed.barrier()
1411
+ if rank == 0:
1412
+ gt = []
1413
+ pred = []
1414
+ print(f"evaluate on rank {rank}. world size is {world_size}")
1415
+ for rank_i in range(world_size):
1416
+ [gt_part, pred_part] = json.load(open(f"{dataset_name}_results_part{rank_i}_{id}.json"))
1417
+ os.remove(f"{dataset_name}_results_part{rank_i}_{id}.json")
1418
+ gt.extend(gt_part)
1419
+ pred.extend(pred_part)
1420
+ print("total num:", len(gt))
1421
+ recalls = recall_score(y_true=gt, y_pred=pred, average=None, labels=[0,1,2,3,4,5])
1422
+ print("recalls:", recalls)
1423
+ with open(os.path.join("eval_results", f"{dataset_name}_{model.expr_name}_{model.step_num}_{int(time.time())}"), "w") as f:
1424
+ f.write(f"{gt}\n")
1425
+ f.write(f"{pred}\n")
1426
+ f.write(f"{recalls}\n")
1427
+ score = 0.0
1428
+ if world_size > 1:
1429
+ torch.distributed.barrier()
1430
+ return score
1431
+
1432
+
1433
+
1434
+ if __name__ == "__main__":
1435
+ main()
multimodal/build/lib/open_flamingo/eval/evaluate_debug.py ADDED
@@ -0,0 +1,1159 @@
1
+ import argparse
2
+ import json
3
+ from math import ceil
4
+ import os
5
+ import random
6
+ import uuid
7
+ from collections import defaultdict
8
+ from typing import Callable
9
+ import time
10
+ import cv2
+ import logging
11
+
12
+ import more_itertools
13
+ import numpy as np
14
+ import torch
15
+ from coco_metric import compute_cider, postprocess_captioning_generation
16
+ from eval_datasets import VQADataset, GQADataset
17
+ from tqdm import tqdm
18
+ from collections import Counter
19
+
20
+ from vqa_metric import compute_vqa_accuracy, compute_gqa_accuracy
21
+ from open_flamingo.eval.classification import (
22
+ compute_per_sample_probs,
23
+ compute_per_sample_loss,
24
+ )
25
+ from open_flamingo.eval.imagenet_utils import (
26
+ openai_imagenet_classnames,
27
+ IMAGENET_1K_CLASS_ID_TO_LABEL,
28
+ )
29
+
30
+ from open_flamingo.src.factory import create_model_and_transforms
31
+ from PIL import Image
32
+ from io import BytesIO
33
+ import base64
34
+ from open_flamingo.train.distributed import init_distributed_device, world_info_from_env
35
+ import string
36
+ from lavis.datasets.builders import load_dataset
37
+
38
+
39
+ def get_iou(box1, box2):
40
+ # box1 and box2 should be in the format [x1, y1, x2, y2]
41
+ intersection = max(0, min(box1[2], box2[2]) - max(box1[0], box2[0])) * \
42
+ max(0, min(box1[3], box2[3]) - max(box1[1], box2[1]))
43
+ area_box1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
44
+ area_box2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
45
+ union = area_box1 + area_box2 - intersection
46
+ iou = intersection / union if union > 0 else 0
47
+ return iou
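+ # e.g. get_iou([0, 0, 10, 10], [5, 5, 15, 15]) = 25 / 175 ≈ 0.143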
48
+
49
+ def expand2square(pil_img, background_color):
50
+ width, height = pil_img.size
51
+ if width == height:
52
+ return pil_img
53
+ elif width > height:
54
+ result = Image.new(pil_img.mode, (width, width), background_color)
55
+ result.paste(pil_img, (0, (width - height) // 2))
56
+ return result
57
+ else:
58
+ result = Image.new(pil_img.mode, (height, height), background_color)
59
+ result.paste(pil_img, ((height - width) // 2, 0))
60
+ return result
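+ # Pads the shorter side with background_color so the image becomes square and keeps its
+ # aspect ratio when later resized to a fixed resolution.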
61
+
62
+ parser = argparse.ArgumentParser()
63
+ parser.add_argument("--lm_path", type=str, default="facebook/opt-1.3b")
64
+ parser.add_argument("--lm_tokenizer_path", type=str, default="facebook/opt-30b")
65
+ parser.add_argument("--vision_encoder_path", default="ViT-L-14", type=str)
66
+ parser.add_argument("--vision_encoder_pretrained", default="openai", type=str)
67
+ parser.add_argument("--checkpoint_path", type=str, required=True)
68
+ parser.add_argument(
69
+ "--results_file", type=str, default=None, help="JSON file to save results"
70
+ )
71
+
72
+ # Trial arguments
73
+ parser.add_argument("--shots", nargs="+", default=[0, 4, 8, 16, 32], type=int)
74
+ parser.add_argument(
75
+ "--num_trials",
76
+ type=int,
77
+ default=1,
78
+ help="Number of trials to run for each shot using different demonstrations",
79
+ )
80
+ parser.add_argument(
81
+ "--trial_seeds",
82
+ nargs="+",
83
+ default=[0],
84
+ help="Seeds to use for each trial for picking demonstrations and eval sets",
85
+ )
86
+ parser.add_argument(
87
+ "--num_samples", type=int, default=5000, help="Number of samples to evaluate on"
88
+ )
89
+
90
+ parser.add_argument("--batch_size", type=int, default=8)
91
+
92
+ # Per-dataset evaluation flags
93
+ parser.add_argument(
94
+ "--eval_coco",
95
+ action="store_true",
96
+ default=False,
97
+ help="Whether to evaluate on COCO.",
98
+ )
99
+ parser.add_argument(
100
+ "--eval_vqav2",
101
+ action="store_true",
102
+ default=False,
103
+ help="Whether to evaluate on VQAV2.",
104
+ )
105
+ parser.add_argument(
106
+ "--eval_ok_vqa",
107
+ action="store_true",
108
+ default=False,
109
+ help="Whether to evaluate on OK-VQA.",
110
+ )
111
+ parser.add_argument(
112
+ "--eval_imagenet",
113
+ action="store_true",
114
+ default=False,
115
+ help="Whether to evaluate on ImageNet.",
116
+ )
117
+
118
+ parser.add_argument(
119
+ "--eval_flickr30",
120
+ action="store_true",
121
+ default=False,
122
+ help="Whether to evaluate on Flickr30.",
123
+ )
124
+
125
+ parser.add_argument(
126
+ "--eval_refcoco",
127
+ action="store_true",
128
+ default=False,
129
+ help="Whether to evaluate on RefCOCO.",
130
+ )
131
+
132
+ # Dataset arguments
133
+
134
+ ## Flickr30 Dataset
135
+ parser.add_argument(
136
+ "--flickr_image_dir_path",
137
+ type=str,
138
+ help="Path to the flickr30/flickr30k_images directory.",
139
+ default=None,
140
+ )
141
+ parser.add_argument(
142
+ "--flickr_annotations_json_path",
143
+ type=str,
144
+ help="Path to the dataset_flickr30k_coco_style.json file.",
145
+ default=None,
146
+ )
147
+
148
+ ## COCO Dataset
149
+ parser.add_argument(
150
+ "--coco_image_dir_path",
151
+ type=str,
152
+ help="Path to the flickr30/flickr30k_images directory.",
153
+ default=None,
154
+ )
155
+ parser.add_argument(
156
+ "--coco_annotations_json_path",
157
+ type=str,
158
+ default=None,
159
+ )
160
+
161
+ ## VQAV2 Dataset
162
+ parser.add_argument(
163
+ "--vqav2_image_dir_path",
164
+ type=str,
165
+ default=None,
166
+ )
167
+ parser.add_argument(
168
+ "--vqav2_questions_json_path",
169
+ type=str,
170
+ default=None,
171
+ )
172
+ parser.add_argument(
173
+ "--vqav2_annotations_json_path",
174
+ type=str,
175
+ default=None,
176
+ )
177
+
178
+ ## OK-VQA Dataset
179
+ parser.add_argument(
180
+ "--ok_vqa_image_dir_path",
181
+ type=str,
182
+ help="Path to the vqav2/train2014 directory.",
183
+ default=None,
184
+ )
185
+ parser.add_argument(
186
+ "--ok_vqa_questions_json_path",
187
+ type=str,
188
+ help="Path to the v2_OpenEnded_mscoco_train2014_questions.json file.",
189
+ default=None,
190
+ )
191
+ parser.add_argument(
192
+ "--ok_vqa_annotations_json_path",
193
+ type=str,
194
+ help="Path to the v2_mscoco_train2014_annotations.json file.",
195
+ default=None,
196
+ )
197
+
198
+ ## Imagenet dataset
199
+ parser.add_argument("--imagenet_root", type=str, default="/tmp")
200
+
201
+ ## RefCOCO dataset
202
+ parser.add_argument("--refcoco_tsvfile", type=str, default=None)
203
+
204
+ parser.add_argument(
205
+ "--location_token_num",
206
+ default=1000,
207
+ type=int,
208
+ )
209
+ # distributed training
210
+ parser.add_argument(
211
+ "--dist-url",
212
+ default="env://",
213
+ type=str,
214
+ help="url used to set up distributed training",
215
+ )
216
+ parser.add_argument(
217
+ "--dist-backend", default="nccl", type=str, help="distributed backend"
218
+ )
219
+ parser.add_argument(
220
+ "--horovod",
221
+ default=False,
222
+ action="store_true",
223
+ help="Use horovod for distributed training.",
224
+ )
225
+ parser.add_argument(
226
+ "--no-set-device-rank",
227
+ default=False,
228
+ action="store_true",
229
+ help="Don't set device index from local rank (when CUDA_VISIBLE_DEVICES restricted to one per proc).",
230
+ )
231
+ parser.add_argument(
232
+ "--dist",
233
+ default=False,
234
+ action="store_true",
235
+ )
236
+ parser.add_argument(
237
+ "--lora",
238
+ default=False,
239
+ action="store_true",
240
+ )
241
+ parser.add_argument(
242
+ "--lora_r",
243
+ default=16,
244
+ type=int,
245
+ required=False,
246
+ )
247
+ parser.add_argument(
248
+ "--legacy",
249
+ default=False,
250
+ action="store_true",
251
+ )
252
+ parser.add_argument(
253
+ "--special",
254
+ default=False,
255
+ action="store_true",
256
+ )
257
+ parser.add_argument(
258
+ "--id",
259
+ default=0,
260
+ type=int,
261
+ required=False,
262
+ )
263
+
264
+ parser.add_argument(
265
+ "--eval_gqa",
266
+ default=False,
267
+ action="store_true",
268
+ )
269
+ parser.add_argument(
270
+ "--use_sam",
271
+ default=None,
272
+ type=str,
273
+ required=False,
274
+ )
275
+ parser.add_argument(
276
+ "--add_visual_token",
277
+ default=False,
278
+ action="store_true",
279
+ )
280
+ parser.add_argument(
281
+ "--use_format_v2",
282
+ default=False,
283
+ action="store_true",
284
+ )
285
+
286
+
287
+ class OKVQAPostProcess():
288
+ def __init__(self):
289
+ self._lemmatizer = None
290
+
291
+ def _lemmatize(self, answers):
292
+ def apply(answer):
293
+ doc = self.lemmatizer(answer)
294
+
295
+ words = []
296
+ for token in doc:
297
+ if token.pos_ in ["NOUN", "VERB"]:
298
+ words.append(token.lemma_)
299
+ else:
300
+ words.append(token.text)
301
+ answer = " ".join(words)
302
+
303
+ return answer
304
+
305
+ return [apply(answer) for answer in answers]
306
+
307
+ @property
308
+ def lemmatizer(self):
309
+ if self._lemmatizer is None:
310
+ try:
311
+ import spacy
312
+
313
+ self._lemmatizer = spacy.load("en_core_web_sm")
314
+ except ImportError:
315
+ logging.error(
316
+ """
317
+ Please install spacy and en_core_web_sm model to apply lemmatization.
318
+ python -m spacy download en_core_web_sm
319
+ OR
320
+ import spacy.cli
321
+ spacy.cli.download("en_core_web_sm")
322
+ """
323
+ )
324
+ exit(1)
325
+
326
+ return self._lemmatizer
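+ # OK-VQA predictions are lemmatized (nouns and verbs reduced to their base form) before
+ # accuracy is computed, so e.g. "running" and "run" count as the same answer.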
327
+
328
+
329
+ def main():
330
+ args = parser.parse_args()
331
+ if args.dist:
332
+ args.local_rank, args.rank, args.world_size = world_info_from_env()
333
+ print(f"local_rank: {args.local_rank} rank: {args.rank} world_size: {args.world_size}")
334
+ device_id = init_distributed_device(args)
335
+ else:
336
+ args.rank = 0
337
+ args.world_size = 1
338
+ print(f"rank: {args.rank} world_size: {args.world_size}")
339
+
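+ # Model variants are inferred from substrings of the checkpoint path (e.g. "sam", "lora",
+ # "box", "pe", "rel", "previsual") rather than from explicit flags.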
340
+ if "sam" in args.checkpoint_path:
341
+ args.use_sam = "vit_l"
342
+
343
+ args.add_visual_token = True
344
+ if "lora" in args.checkpoint_path:
345
+ args.lora = True
346
+
347
+
348
+ args.add_pe = False
349
+ args.add_box = False
350
+ args.relation = False
351
+ if "debug" in args.checkpoint_path:
352
+ # args.add_pe = True
353
+ args.add_box = True
354
+ if "box" in args.checkpoint_path:
355
+ args.add_box = True
356
+ if "pe" in args.checkpoint_path:
357
+ args.add_pe = True
358
+ if "rel" in args.checkpoint_path:
359
+ args.relation = True
360
+ args.add_pe = False
361
+ if "previsual" in args.checkpoint_path:
362
+ args.use_format_v2 = True
363
+ args.relation = False
364
+
365
+
366
+
367
+ # load model
368
+ flamingo, image_processor, tokenizer, vis_embed_size = create_model_and_transforms(
369
+ args.vision_encoder_path,
370
+ args.vision_encoder_pretrained,
371
+ args.lm_path,
372
+ args.lm_tokenizer_path,
373
+ location_token_num=args.location_token_num,
374
+ lora=args.lora,
375
+ lora_r=16,
376
+ use_sam=args.use_sam,
377
+ add_visual_token=args.add_visual_token,
378
+ use_format_v2=args.use_format_v2,
379
+ add_box=args.add_box,
380
+ add_pe=args.add_pe,
381
+ add_relation=args.relation,
382
+ )
383
+ flamingo.use_format_v2 = args.use_format_v2
384
+ if args.special:
385
+ flamingo.special = True
386
+ else:
387
+ flamingo.special = False
388
+ if args.legacy:
389
+ flamingo.legacy = True
390
+ print("use legacy evaluation")
391
+ flamingo.step_num = int(args.checkpoint_path.split("/")[-1].split(".")[0].split("_")[-1])
392
+ flamingo.expr_name = args.checkpoint_path.split("/")[-2]
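+ # The step number is parsed from the trailing "_<step>" of the checkpoint filename and
+ # the experiment name from its parent directory.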
393
+ if args.rank == 0:
394
+ print("legacy", True if hasattr(flamingo, "legacy") else False)
395
+ print("step:", flamingo.step_num)
396
+ print("expr:", flamingo.expr_name)
397
+ print("use format v2:", flamingo.use_format_v2)
398
+ print(args)
399
+ checkpoint = torch.load(args.checkpoint_path, map_location="cpu")
400
+ model_state_dict = {}
401
+ for key in checkpoint["model_state_dict"].keys():
402
+ model_state_dict[key.replace("module.", "")] = checkpoint["model_state_dict"][key]
403
+ if "vision_encoder.logit_scale"in model_state_dict:
404
+ # previous checkpoint has some unnecessary weights
405
+ del model_state_dict["vision_encoder.logit_scale"]
406
+ del model_state_dict["vision_encoder.visual.proj"]
407
+ del model_state_dict["vision_encoder.visual.ln_post.weight"]
408
+ del model_state_dict["vision_encoder.visual.ln_post.bias"]
409
+ flamingo.load_state_dict(model_state_dict, strict=True)
410
+ results = defaultdict(list)
411
+ if args.eval_coco:
412
+ print("Evaluating on COCO...")
413
+ for shot in args.shots:
414
+ scores = []
415
+ for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
416
+ cider_score = evaluate_coco_flickr(
417
+ model=flamingo,
418
+ tokenizer=tokenizer,
419
+ image_processor=image_processor,
420
+ batch_size=args.batch_size,
421
+ image_dir_path=args.coco_image_dir_path,
422
+ annotations_json_path=args.coco_annotations_json_path,
423
+ device=args.device,
424
+ seed=seed,
425
+ vis_embed_size=vis_embed_size,
426
+ rank=args.rank,
427
+ world_size=args.world_size,
428
+ id=args.id,
429
+ )
430
+ print(f"Shots {shot} Trial {trial} CIDEr score: {cider_score}")
431
+ scores.append(cider_score)
432
+ print(f"Shots {shot} Mean CIDEr score: {np.mean(scores)}")
433
+ results["coco"].append(
434
+ {"shots": shot, "trials": scores, "mean": np.mean(scores)}
435
+ )
436
+
437
+ if args.eval_ok_vqa:
438
+ print("Evaluating on OK-VQA...")
439
+ for shot in args.shots:
440
+ scores = []
441
+ for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
442
+ ok_vqa_score = evaluate_vqa(
443
+ model=flamingo,
444
+ tokenizer=tokenizer,
445
+ image_processor=image_processor,
446
+ batch_size=args.batch_size,
447
+ image_dir_path=args.ok_vqa_image_dir_path,
448
+ questions_json_path=args.ok_vqa_questions_json_path,
449
+ annotations_json_path=args.ok_vqa_annotations_json_path,
450
+ vqa_dataset="ok_vqa",
451
+ vis_embed_size=vis_embed_size,
452
+ rank=args.rank,
453
+ world_size=args.world_size,
454
+ id=args.id,
455
+ )
456
+ results["ok_vqa"].append(
457
+ {"shots": shot, "score": ok_vqa_score}
458
+ )
459
+
460
+ if args.eval_vqav2:
461
+ print("Evaluating on VQAv2...")
462
+ for shot in args.shots:
463
+ scores = []
464
+ for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
465
+ vqa_score = evaluate_vqa(
466
+ model=flamingo,
467
+ tokenizer=tokenizer,
468
+ image_processor=image_processor,
469
+ batch_size=args.batch_size,
470
+ image_dir_path=args.vqav2_image_dir_path,
471
+ questions_json_path=args.vqav2_questions_json_path,
472
+ annotations_json_path=args.vqav2_annotations_json_path,
473
+ vqa_dataset="vqa",
474
+ vis_embed_size=vis_embed_size,
475
+ rank=args.rank,
476
+ world_size=args.world_size,
477
+ id=args.id,
478
+ )
479
+ results["vqav2"].append(
480
+ {"shots": shot, "score": vqa_score}
481
+ )
482
+
483
+ if args.eval_gqa:
484
+ print("Evaluating on GQA...")
485
+ for shot in args.shots:
486
+ scores = []
487
+ for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
488
+ vqa_score = evaluate_vqa(
489
+ model=flamingo,
490
+ tokenizer=tokenizer,
491
+ image_processor=image_processor,
492
+ batch_size=args.batch_size,
493
+ vqa_dataset="gqa",
494
+ vis_embed_size=vis_embed_size,
495
+ rank=args.rank,
496
+ world_size=args.world_size,
497
+ id=args.id,
498
+ )
499
+ results["gqa"].append(
500
+ {"shots": shot, "score": vqa_score}
501
+ )
502
+
503
+ if args.eval_imagenet:
504
+ print("Evaluating on ImageNet...")
505
+ for shot in args.shots:
506
+ scores = []
507
+ for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
508
+ imagenet_score = evaluate_imagenet(
509
+ model=flamingo,
510
+ tokenizer=tokenizer,
511
+ image_processor=image_processor,
512
+ batch_size=args.batch_size,
513
+ num_samples=args.num_samples,
514
+ num_shots=shot,
515
+ device=args.device,
516
+ seed=seed,
517
+ imagenet_root=args.imagenet_root,
518
+ )
519
+ print(
520
+ f"Shots {shot} Trial {trial} " f"ImageNet score: {imagenet_score}"
521
+ )
522
+ scores.append(imagenet_score)
523
+ print(f"Shots {shot} Mean ImageNet score: {np.mean(scores)}")
524
+ results["imagenet"].append(
525
+ {"shots": shot, "trials": scores, "mean": np.mean(scores)}
526
+ )
527
+
528
+ if args.eval_refcoco:
529
+ print("Evaluating on RefCOCO...")
530
+ refcoco_score = evaluate_refcoco(
531
+ model=flamingo,
532
+ tokenizer=tokenizer,
533
+ image_processor=image_processor,
534
+ batch_size=args.batch_size,
535
+ device=args.device,
536
+ tsvfile=args.refcoco_tsvfile,
537
+ vis_embed_size=vis_embed_size,
538
+ rank=args.rank,
539
+ world_size=args.world_size,
540
+ id=args.id,
541
+ )
542
+ results["refcoco"].append(
543
+ {"score": refcoco_score}
544
+ )
545
+
546
+ def prepare_batch_images(batch, image_processor):
547
+ batch_images = None
548
+ for b in batch:
549
+ b_image = image_processor(b["image"]).unsqueeze(0).unsqueeze(1).unsqueeze(0)
550
+ if batch_images is None:
551
+ batch_images = b_image
552
+ else:
553
+ batch_images = torch.cat([batch_images, b_image], dim=0)
554
+ return batch_images
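+ # Each image becomes a [1, 1, 1, C, H, W] tensor; concatenating along dim 0 yields a
+ # [B, 1, 1, C, H, W] batch (presumably batch, images per sample, frames, channels, H, W).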
555
+
556
+ def get_outputs(
557
+ model,
558
+ batch_images,
559
+ attention_mask,
560
+ max_generation_length,
561
+ min_generation_length,
562
+ num_beams,
563
+ length_penalty,
564
+ input_ids,
565
+ image_start_index_list=None,
566
+ image_nums=None,
567
+ bad_words_ids=None,
568
+ ):
569
+ with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
570
+ outputs = model.generate(
571
+ batch_images,
572
+ input_ids,
573
+ attention_mask=attention_mask,
574
+ max_new_tokens=max_generation_length,
575
+ min_length=min_generation_length,
576
+ num_beams=num_beams,
577
+ length_penalty=length_penalty,
578
+ image_start_index_list=image_start_index_list,
579
+ image_nums=image_nums,
580
+ bad_words_ids=bad_words_ids,
581
+ )
582
+
583
+ outputs = outputs[:, len(input_ids[0]) :]
584
+ return outputs
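+ # The prompt tokens are stripped off, so only newly generated tokens are returned.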
585
+
586
+
587
+ def evaluate_coco_flickr(
588
+ model,
589
+ tokenizer,
590
+ image_processor,
591
+ batch_size,
592
+ image_dir_path,
593
+ annotations_json_path,
594
+ seed=42,
595
+ max_generation_length=20,
596
+ num_beams=1,
597
+ length_penalty=-2.0,
598
+ device=-1,
599
+ is_flickr=False,
600
+ vis_embed_size=None,
601
+ rank=0,
602
+ world_size=1,
603
+ id=0,
604
+ ):
605
+ """Evaluate a model on COCO dataset.
606
+
607
+ Args:
608
+ model (nn.Module): model to evaluate
609
+ tokenizer (transformers.PreTrainedTokenizer): tokenizer for the model
610
+ image_processor : image processor for the model
611
+ batch_size (int): batch size
612
+ image_dir_path (str, optional): path to the directory containing the images.
613
+ annotations_json_path (str, optional): path to the json file containing the annotations.
614
+ seed (int, optional): seed for random number generator. Defaults to 42.
615
+ max_generation_length (int, optional): maximum length of the generated caption. Defaults to 20.
616
+ num_beams (int, optional): number of beams to use for beam search. Defaults to 1.
617
+ length_penalty (float, optional): length penalty for beam search. Defaults to -2.0.
618
+ num_samples (int, optional): number of samples to evaluate on. Defaults to 5000.
619
+ query_set_size (int, optional): number of samples to use for query set. Defaults to 2048.
620
+ num_shots (int, optional): number of in-context samples to use. Defaults to 8.
621
+ device (int, optional): device to use. Defaults to -1.
622
+ num_workers (int, optional): number of workers to use for dataloader. Defaults to 4.
623
+ is_flickr (bool): defines if that data is COCO or Flickr. Defaults to False (COCO).
624
+
625
+ Returns:
626
+ float: CIDEr score
627
+
628
+ """
629
+ # eval_dataset = COCOFlickrDataset(
630
+ # image_dir_path=image_dir_path,
631
+ # annotations_path=annotations_json_path,
632
+ # is_flickr=is_flickr,
633
+ # )
634
+ coco_dataset = load_dataset("coco_caption")
635
+ eval_dataset = coco_dataset["test"]
636
+
637
+
638
+ model.eval().cuda()
639
+ predictions = defaultdict()
640
+ lang_encoder_name = model.lang_encoder.__class__.__name__.lower()
641
+ # if "peft" in lang_encoder_name:
642
+ # lang_encoder_name = model.lang_encoder.base_model.model.__class__.__name__.lower()
643
+ try:
644
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
645
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
646
+ pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
647
+ bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
648
+ except:
649
+ pass
650
+
651
+ def get_prompt(sample):
652
+ return f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>"
653
+
654
+ tokenizer.padding_side = "left"
655
+ cnt = 0
656
+ if world_size > 1:
657
+ torch.distributed.barrier()
658
+ desc = "Running inference Flickr30" if is_flickr else "Running inference COCO"
659
+ for ii, batch in enumerate(more_itertools.chunked(
660
+ tqdm(eval_dataset, desc=desc, disable=(rank != 0)), batch_size
661
+ )):
662
+ if ii % world_size != rank:
663
+ continue
664
+ cnt += len(batch)
665
+ batch_images = prepare_batch_images(
666
+ batch=batch,
667
+ image_processor=image_processor,
668
+ ).cuda()
669
+ batch_text = [get_prompt(s) for s in batch]
670
+ encodings = tokenizer(
671
+ batch_text,
672
+ padding="longest",
673
+ truncation=True,
674
+ return_tensors="pt",
675
+ max_length=2000,
676
+ )
677
+ input_ids = encodings["input_ids"].cuda()
678
+ attention_mask = encodings["attention_mask"].cuda()
679
+ skip_special_tokens = False
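+ # Legacy OPT checkpoints expect the image markers one position earlier: the loop below
+ # shifts <|#image#|> / <|#endofimage#|> left by one and back-fills the vacated positions
+ # with pad / bos tokens.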
680
+ if hasattr(model, "legacy") and model.legacy and "opt" in lang_encoder_name:
681
+ if rank == 0:
682
+ tqdm.write("use legacy model")
683
+ skip_special_tokens = True
684
+ for i in range(len(input_ids)):
685
+ media_token_index = (input_ids[i] == media_token_id).nonzero()[0,0]
686
+ endofmedia_token_index = (input_ids[i] == endofmedia_token_id).nonzero()[0,0]
687
+ input_ids[i, media_token_index - 1] = media_token_id
688
+ input_ids[i, media_token_index] = pad_token_id
689
+ input_ids[i, endofmedia_token_index - 1] = endofmedia_token_id
690
+ input_ids[i, endofmedia_token_index] = bos_token_id
691
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
692
+ image_start_index_list = [[x] for x in image_start_index_list]
693
+ image_nums = [1] * len(input_ids)
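+ # image_start_index_list marks the position right after each <|#image#|> token, where the
+ # vis_embed_size placeholder pad tokens sit; every sample carries one image, so image_nums
+ # is [1] per row.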
694
+ if "llama" in lang_encoder_name:
695
+ attention_mask[input_ids == 0] = 0
696
+ outputs = get_outputs(
697
+ model=model,
698
+ batch_images=batch_images,
699
+ attention_mask=attention_mask,
700
+ max_generation_length=30,
701
+ min_generation_length=8,
702
+ num_beams=5,
703
+ length_penalty=0,
704
+ input_ids=input_ids,
705
+ image_start_index_list=image_start_index_list,
706
+ image_nums=image_nums,
707
+ )
708
+ new_predictions = [
709
+ postprocess_captioning_generation(out).replace('"', "")
710
+ for out in tokenizer.batch_decode(outputs, skip_special_tokens=True)
711
+ ]
712
+ # if rank == 0:
713
+ # tqdm.write(f"{batch_images.shape} {batch[0]} pred: {new_predictions[0]}")
714
+
715
+ for i, sample in enumerate(batch):
716
+ predictions[int(sample["image_id"])] = {
717
+ "caption": new_predictions[i],
718
+ }
719
+ results_path = (
720
+ f"flickrresults_{lang_encoder_name}_{rank}_{id}.json"
721
+ if is_flickr
722
+ else f"cocoresults_{lang_encoder_name}_{rank}_{id}.json"
723
+ )
724
+ with open(results_path, "w") as f:
725
+ f.write(
726
+ json.dumps(
727
+ [
728
+ {"image_id": k, "caption": predictions[k]["caption"]}
729
+ for k in predictions
730
+ ],
731
+ indent=2,
732
+ )
733
+ )
734
+ print("save to", results_path)
735
+ del predictions
736
+ time.sleep(10)
737
+ if world_size > 1:
738
+ torch.distributed.barrier()
739
+ if rank == 0:
740
+ print(f"evaluate on rank {rank}. world size is {world_size}")
741
+ predictions = []
742
+ for rank_i in range(world_size):
743
+ part_results_path = (
744
+ f"flickrresults_{lang_encoder_name}_{rank_i}_{id}.json"
745
+ if is_flickr
746
+ else f"cocoresults_{lang_encoder_name}_{rank_i}_{id}.json"
747
+ )
748
+ print("load", part_results_path)
749
+ predictions.extend(json.load(open(part_results_path)))
750
+ os.remove(part_results_path)
751
+ print("num:", len(predictions))
752
+ results_path = (
753
+ f"flickrresults_{lang_encoder_name}.json"
754
+ if is_flickr
755
+ else f"cocoresults_{lang_encoder_name}.json"
756
+ )
757
+ json.dump(predictions, open(results_path, "w"), indent=2)
758
+
759
+ metrics = compute_cider(
760
+ result_path=results_path,
761
+ annotations_path="/gpfs/u/home/LMCG/LMCGljnn/scratch/.cache/lavis/coco_gt/coco_karpathy_test_gt.json",
762
+ )
763
+ os.makedirs("eval_results", exist_ok=True)
764
+ acc = metrics["CIDEr"]
765
+ with open(os.path.join("eval_results", f"cococap_{model.expr_name}_{model.step_num}_{int(time.time())}_{acc}"), "w") as f:
766
+ f.write(json.dumps(predictions, indent=2))
767
+
768
+ # delete the temporary file
769
+ os.remove(results_path)
770
+ else:
771
+ metrics = {}
772
+ metrics["CIDEr"] = 0.0
773
+
774
+ return metrics["CIDEr"]
775
+
776
+
777
+ def evaluate_vqa(
778
+ model,
779
+ tokenizer,
780
+ image_processor,
781
+ batch_size,
782
+ image_dir_path=None,
783
+ questions_json_path=None,
784
+ annotations_json_path=None,
785
+ vqa_dataset="vqa",
786
+ vis_embed_size=None,
787
+ rank=0,
788
+ world_size=1,
789
+ id=0,
790
+ ):
791
+ """
792
+ Evaluate a model on VQA datasets. Currently supports VQA v2.0.
793
+
794
+ Args:
795
+ model (nn.Module): model to evaluate
796
+ tokenizer (transformers.PreTrainedTokenizer): tokenizer for the model
797
+ image_processor : image processor for the model
798
+ batch_size (int): batch size
799
+ image_dir_path (str): path to image directory
800
+ questions_json_path (str): path to questions json file
801
+ annotations_json_path (str): path to annotations json file
802
+ seed (int, optional): random seed. Defaults to 42.
803
+ max_generation_length (int, optional): max generation length. Defaults to 5.
804
+ num_beams (int, optional): number of beams to use for beam search. Defaults to 3.
805
+ length_penalty (float, optional): length penalty for beam search. Defaults to -2.0.
806
+ num_samples (int, optional): number of samples to evaluate on. Defaults to 5000 samples.
807
+ query_set_size (int, optional): size of the query set. Defaults to 2048.
808
+ num_shots (int, optional): number of shots to use. Defaults to 8.
809
+ device (int, optional): device to use. Defaults to -1 (cpu).
810
+ num_workers (int, optional): number of workers to use. Defaults to 4.
811
+ vqa_dataset (string): type of vqa dataset: currently supports vqa, ok_vqa. Defaults to vqa.
812
+ Returns:
813
+ float: accuracy score
814
+ """
815
+ if world_size > 1:
816
+ torch.distributed.barrier()
817
+ if vqa_dataset == "gqa":
818
+ eval_dataset = GQADataset()
819
+ else:
820
+ eval_dataset = VQADataset(
821
+ image_dir_path=image_dir_path,
822
+ question_path=questions_json_path,
823
+ annotations_path=annotations_json_path,
824
+ vqa_dataset=vqa_dataset,
825
+ )
826
+ postprocessor = OKVQAPostProcess()
827
+ try:
828
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
829
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
830
+ pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
831
+ bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
832
+ except:
833
+ pass
834
+ def get_prompt(sample):
835
+ return f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>Question: {sample['question'].strip()} Short answer:"
836
+ # return f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>"
837
+
838
+ model.eval().cuda()
839
+ lang_encoder_name = model.lang_encoder.__class__.__name__.lower()
840
+ if "peft" in lang_encoder_name:
841
+ lang_encoder_name = model.lang_encoder.base_model.model.__class__.__name__.lower()
842
+ predictions = []
843
+ tokenizer.padding_side = "left"
844
+ if world_size > 1:
845
+ torch.distributed.barrier()
846
+ for ii, batch in enumerate(more_itertools.chunked(
847
+ tqdm(eval_dataset, desc="Running inference", disable=(rank != 0)), batch_size
848
+ )):
849
+ if ii % world_size != rank:
850
+ continue
851
+ batch_images = prepare_batch_images(
852
+ batch=batch,
853
+ image_processor=image_processor,
854
+ ).cuda()
855
+ batch_text = [get_prompt(s) for s in batch]
856
+ encodings = tokenizer(
857
+ batch_text,
858
+ return_tensors="pt",
859
+ padding="longest",
860
+ truncation=True,
861
+ max_length=2000,
862
+ )
863
+ input_ids = encodings["input_ids"].cuda()
864
+ attention_mask = encodings["attention_mask"].cuda()
865
+ skip_special_tokens = True
866
+ if hasattr(model, "legacy") and model.legacy and "opt" in lang_encoder_name:
867
+ if rank == 0:
868
+ tqdm.write("use legacy model")
869
+ for i in range(len(input_ids)):
870
+ media_token_index = (input_ids[i] == media_token_id).nonzero()[0,0]
871
+ endofmedia_token_index = (input_ids[i] == endofmedia_token_id).nonzero()[0,0]
872
+ input_ids[i, media_token_index - 1] = media_token_id
873
+ input_ids[i, media_token_index] = pad_token_id
874
+ input_ids[i, endofmedia_token_index - 1] = endofmedia_token_id
875
+ input_ids[i, endofmedia_token_index] = bos_token_id
876
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
877
+ image_start_index_list = [[x] for x in image_start_index_list]
878
+ image_nums = [1] * len(input_ids)
879
+ if "llama" in lang_encoder_name:
880
+ attention_mask[input_ids == 0] = 0
881
+ outputs = get_outputs(
882
+ model=model,
883
+ batch_images=batch_images,
884
+ attention_mask=attention_mask,
885
+ max_generation_length=10,
886
+ min_generation_length=1,
887
+ num_beams=5,
888
+ length_penalty=0,
889
+ input_ids=input_ids,
890
+ image_start_index_list=image_start_index_list,
891
+ image_nums=image_nums,
892
+ )
893
+ # postprocess begin
894
+ new_predictions = [
895
+ out.strip().lower().strip(string.punctuation+" ") for out in tokenizer.batch_decode(outputs, skip_special_tokens=skip_special_tokens)
896
+ ]
897
+ if vqa_dataset == "ok_vqa":
898
+ new_predictions = postprocessor._lemmatize(new_predictions)
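+ # In "special" mode, snap each free-form generation to a ground-truth answer it contains
+ # (most common answers first), with extra handling mapping "cant"/"can" to "no"/"yes".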
899
+ if model.special:
900
+ for i in range(len(new_predictions)):
901
+ for answer, _ in Counter(batch[i]['answers']).most_common():
902
+ if answer in new_predictions[i]:
903
+ new_predictions[i] = answer
904
+ break
905
+ if "cant" in new_predictions[i] and "no" == answer:
906
+ new_predictions[i] = answer
907
+ break
908
+ if "can" in new_predictions[i] and "not" not in new_predictions[i] and "cant" not in new_predictions[i] and "yes" == answer:
909
+ new_predictions[i] = answer
910
+ break
911
+
912
+ # if rank == 0:
913
+ # tqdm.write(f"{image_nums} {image_start_index_list}")
914
+ # for i in range(1):
915
+ # tqdm.write(f"ID: {batch[i]['question_id']} | gt QA: {batch[i]['question']} {Counter(batch[i]['answers']).most_common()}")
916
+ # tqdm.write("prompt: " + tokenizer.decode(input_ids[i]))
917
+ # tqdm.write("model output: " + new_predictions[i])
918
+
919
+ predictions.extend(
920
+ [
921
+ {"answer": p, "question_id": sample["question_id"], "_question": sample["question"], "answers": sample["answers"]}
922
+ for p, sample in zip(new_predictions, batch)
923
+ ]
924
+ )
925
+ with open(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank}_{id}.json", "w") as f:
926
+ f.write(json.dumps(predictions))
927
+ print("save to", f"{vqa_dataset}_{lang_encoder_name}_results_part{rank}_{id}.json")
928
+
929
+ time.sleep(10)
930
+ if world_size > 1:
931
+ torch.distributed.barrier()
932
+ if rank == 0:
933
+ print(f"evaluate on rank {rank}. world size is {world_size}")
934
+ predictions = []
935
+ for rank_i in range(world_size):
936
+ print("load", f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")
937
+ predictions.extend(json.load(open(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")))
938
+ os.remove(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")
939
+ print("num:", len(predictions))
940
+ # save the predictions to a temporary file
941
+ random_uuid = str(uuid.uuid4())
942
+ with open(f"{vqa_dataset}results_{random_uuid}.json", "w") as f:
943
+ f.write(json.dumps(predictions, indent=4))
944
+
945
+ if vqa_dataset == "gqa":
946
+ acc = compute_gqa_accuracy(predictions)
947
+ else:
948
+ acc = compute_vqa_accuracy(
949
+ f"{vqa_dataset}results_{random_uuid}.json",
950
+ questions_json_path,
951
+ annotations_json_path,
952
+ vqa_dataset=vqa_dataset,
953
+ )
954
+ print(vqa_dataset, "score:", acc, "| save to", f"{vqa_dataset}results_{random_uuid}.json")
955
+ os.makedirs("eval_results", exist_ok=True)
956
+ with open(os.path.join("eval_results", f"{vqa_dataset}_{model.expr_name}_{model.step_num}_{int(time.time())}_{acc}"), "w") as f:
957
+ f.write(json.dumps(predictions, indent=2))
958
+
959
+ # delete the temporary file
960
+ os.remove(f"{vqa_dataset}results_{random_uuid}.json")
961
+ else:
962
+ time.sleep(5)
963
+ acc = 0.0
964
+ if world_size > 1:
965
+ torch.distributed.barrier()
966
+ return acc
967
+
968
+
969
+ def evaluate_refcoco(
970
+ model,
971
+ tokenizer,
972
+ image_processor,
973
+ batch_size,
974
+ tsvfile,
975
+ max_generation_length=20,
976
+ num_beams=3,
977
+ length_penalty=-2.0,
978
+ device=-1,
979
+ vis_embed_size=None,
980
+ rank=0,
981
+ world_size=1,
982
+ id=0,
983
+ ):
984
+ model.eval().cuda()
985
+ loc_token_ids = []
986
+ for i in range(1000):
987
+ loc_token_ids.append(int(tokenizer(f"<loc_{i}>", add_special_tokens=False)["input_ids"][-1]))
988
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
989
+ total = 0
990
+ correct = 0
991
+ ious = []
992
+ if "refcocog" in tsvfile:
993
+ dataset_name = "refcocog"
994
+ elif "refcocoplus" in tsvfile:
995
+ dataset_name = "refcocoplus"
996
+ else:
997
+ dataset_name = "refcoco"
998
+ with open(tsvfile, "r") as f:
999
+ lines = f.readlines()
1000
+ pbar = tqdm(lines, disable=(rank != 0))
1001
+ for ii, line in enumerate(pbar):
1002
+ if ii % world_size != rank:
1003
+ continue
1004
+ total += 1
1005
+ line = line.rstrip()
1006
+ uniq_id, image_id, text, region_coord, image = line.split("\t")
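+ # Each tsv row holds: uniq_id, image_id, referring expression, comma-separated ground-truth
+ # box, and a base64-encoded image.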
1007
+
1008
+ # image = Image.open(BytesIO(base64.urlsafe_b64decode(image))).convert("RGB")
1009
+ # image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal/temp/cat.png").convert("RGB")
1010
+ # image2 = Image.open("yolo.png").convert("RGB")
1011
+ # image1 = image1.resize((224, 224))
1012
+ # image2 = image2.resize((224, 224))
1013
+ # images = [image1, image2]
1014
+
1015
+ # gt_box = np.array(list(map(float, region_coord.split(","))))
1016
+ # width = image.width
1017
+ # height = image.height
1018
+ # gt_box /= np.array([width, height, width, height])
1019
+ # batch_images = [image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0) for image in images]
1020
+ # batch_images = torch.cat(batch_images, dim=0)
1021
+ # image = Image.open("yolo_test.png").convert("RGB")
1022
+ image = Image.open("example.png").convert("RGB")
1023
+ image = image.resize((224, 224))
1024
+ batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
1025
+ # prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>{text.rstrip('.')}<|#visual#|>"]
1026
+ prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|><|#object#|><|#previsual#|><|#prebox#|><|#endofattr#|>man<|#endofobject#|><|#visual#|><|#box#|><|#endofattr#|> is sitting on<|#object#|><|#previsual#|>"]
1027
+ # prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|><|#object#|><|#previsual#|>man<|#endofobject#|><|#visual#|><|#box#|><|#endofattr#|> is sitting on<|#object#|><|#previsual#|>"]
1028
+ # prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>"]
1029
+ # prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>a man<|#visual#|> is doing a trick on a skateboard<|#visual#|>"]
1030
+
1031
+
1032
+ encodings = tokenizer(
1033
+ prompt,
1034
+ padding="longest",
1035
+ truncation=True,
1036
+ return_tensors="pt",
1037
+ max_length=2000,
1038
+ )
1039
+ input_ids = encodings["input_ids"]
1040
+ attention_mask = encodings["attention_mask"]
1041
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
1042
+ image_start_index_list = [image_start_index_list]
1043
+ image_nums = [1]
1044
+ vision_x = batch_images.cuda()
1045
+ lang_x = input_ids.cuda()
1046
+ attention_mask = attention_mask.cuda()
1047
+ print(image_start_index_list, image_nums)
1048
+
1049
+ model.debug_id = 0
1050
+ # outputs = get_outputs(
1051
+ # model=model,
1052
+ # batch_images=vision_x,
1053
+ # attention_mask=attention_mask,
1054
+ # max_generation_length=20,
1055
+ # min_generation_length=8,
1056
+ # num_beams=5,
1057
+ # length_penalty=0,
1058
+ # input_ids=lang_x,
1059
+ # image_start_index_list=image_start_index_list,
1060
+ # image_nums=image_nums,
1061
+ # )
1062
+ # print(tokenizer.decode(outputs[0]))
1063
+ # exit()
1064
+
1065
+ prebox = [93, 20, 155, 172] # man
1066
+ # prebox = [32, 82, 89, 213] # dog
1067
+ # prebox = [34, 49, 166, 164] # bike
1068
+ with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
1069
+ outputs = model(
1070
+ vision_x=vision_x,
1071
+ lang_x=lang_x,
1072
+ attention_mask=attention_mask,
1073
+ labels=None,
1074
+ image_nums=image_nums,
1075
+ image_start_index_list=image_start_index_list,
1076
+ added_bbox_list=[torch.tensor(prebox).cuda().unsqueeze(0) / 224],
1077
+ add_box=True,
1078
+ debug_mode=True,
1079
+ )
1080
+
1081
+ boxes = outputs["boxes"]
1082
+ scores = outputs["scores"]
1083
+ box = boxes[scores.argmax()]
1084
+ open_cv_image = np.array(image)
1085
+ # Convert RGB to BGR
1086
+ open_cv_image = open_cv_image[:, :, ::-1].copy()
1087
+ open_cv_image = cv2.rectangle(open_cv_image, box[:2].astype(int), box[2:].astype(int), (255, 0, 0), 2)
1088
+ open_cv_image = cv2.rectangle(open_cv_image, prebox[:2], prebox[2:], (0, 0, 255), 2)
1089
+ cv2.imwrite(f"output2.jpg", open_cv_image)
1090
+ print(box)
1091
+ print(prebox)
1092
+ exit()
1093
+
1094
+ # force_words = ["man", "table"]
1095
+ # force_words_ids = tokenizer(force_words, add_special_tokens=False).input_ids
1096
+
1097
+
1098
+ # sequences, hidden_states_for_each_step = get_outputs(
1099
+ # model=model,
1100
+ # batch_images=vision_x,
1101
+ # attention_mask=attention_mask,
1102
+ # max_generation_length=20,
1103
+ # min_generation_length=8,
1104
+ # num_beams=5,
1105
+ # length_penalty=0,
1106
+ # input_ids=lang_x,
1107
+ # image_start_index_list=image_start_index_list,
1108
+ # image_nums=image_nums,
1109
+ # force_words_ids=force_words_ids,
1110
+ # )
1111
+ # sequence = sequences[0]
1112
+ # print(tokenizer.decode(sequence))
1113
+ # for i, token in enumerate(sequence):
1114
+ # if token == model.visual_token_id:
1115
+ # print(tokenizer.decode(sequence[:i+1]))
1116
+ # if hasattr(model, "debug_id"):
1117
+ # model.debug_id += 1
1118
+ # else:
1119
+ # model.debug_id = 0
1120
+ # this_lang_x = torch.hstack([lang_x[0], sequence[:i+1]]).unsqueeze(0)
1121
+ # this_attention_mask = torch.ones_like(this_lang_x).cuda()
1122
+ # with torch.inference_mode() and torch.cuda.amp.autocast(dtype=torch.float16) and torch.no_grad():
1123
+ # _ = model(
1124
+ # vision_x=vision_x,
1125
+ # lang_x=this_lang_x,
1126
+ # attention_mask=this_attention_mask,
1127
+ # labels=None,
1128
+ # image_nums=image_nums,
1129
+ # image_start_index_list=image_start_index_list,
1130
+ # added_bbox_list=None,
1131
+ # )
1132
+ # exit()
1133
+
1134
+ with open(f"{dataset_name}_results_part{rank}_{id}.json", "w") as f:
1135
+ f.write(json.dumps([total, correct]))
1136
+ if world_size > 1:
1137
+ torch.distributed.barrier()
1138
+ if rank == 0:
1139
+ total = 0
1140
+ correct = 0
1141
+ print(f"evaluate on rank {rank}. world size is {world_size}")
1142
+ for rank_i in range(world_size):
1143
+ [total_part, correct_part] = json.load(open(f"{dataset_name}_results_part{rank_i}_{id}.json"))
1144
+ os.remove(f"{dataset_name}_results_part{rank_i}_{id}.json")
1145
+ total += total_part
1146
+ correct += correct_part
1147
+ score = correct / total
1148
+ print("score:", score)
1149
+ with open(os.path.join("eval_results", f"{dataset_name}_{model.expr_name}_{model.step_num}_{int(time.time())}_{score}"), "w") as f:
1150
+ pass
1151
+ else:
1152
+ score = 0.0
1153
+ if world_size > 1:
1154
+ torch.distributed.barrier()
1155
+ return score
1156
+
1157
+
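# The block above follows the per-rank aggregation pattern used throughout these eval
# scripts: each rank dumps a small [total, correct] JSON, then rank 0 merges them after
# a barrier. A minimal standalone sketch of that pattern (file names and world_size are
# placeholders, not values from this script):
import json, os

def _merge_rank_results_sketch(dataset_name, run_id, world_size):
    # each rank is assumed to have written f"{dataset_name}_results_part{rank}_{run_id}.json"
    total, correct = 0, 0
    for rank_i in range(world_size):
        path = f"{dataset_name}_results_part{rank_i}_{run_id}.json"
        part_total, part_correct = json.load(open(path))
        os.remove(path)
        total += part_total
        correct += part_correct
    return correct / total if total > 0 else 0.0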
1158
+ if __name__ == "__main__":
1159
+ main()
multimodal/build/lib/open_flamingo/eval/evaluate_find_showcase.py ADDED
@@ -0,0 +1,1700 @@
1
+ import argparse
2
+ import json
3
+ from math import ceil
4
+ import os
5
+ import random
6
+ import uuid
7
+ from collections import defaultdict
8
+ from typing import Callable
9
+ import time
10
+ import cv2
11
+ import webdataset as wds
12
+ from sklearn.metrics import recall_score, average_precision_score
13
+
14
+ import more_itertools
15
+ import numpy as np
16
+ import torch
17
+ from coco_metric import compute_cider, postprocess_captioning_generation
18
+ from eval_datasets import VQADataset
19
+ from tqdm import tqdm
20
+ from collections import Counter
21
+
22
+ from vqa_metric import compute_vqa_accuracy, compute_gqa_accuracy
23
+ from open_flamingo.eval.classification import (
24
+ compute_per_sample_probs,
25
+ compute_per_sample_loss,
26
+ )
27
+ from open_flamingo.eval.imagenet_utils import (
28
+ openai_imagenet_classnames,
29
+ IMAGENET_1K_CLASS_ID_TO_LABEL,
30
+ )
31
+
32
+ from open_flamingo.src.factory import create_model_and_transforms
33
+ from PIL import Image
34
+ from io import BytesIO
35
+ import base64
36
+ from open_flamingo.train.distributed import init_distributed_device, world_info_from_env
37
+ import string
38
+ from lavis.datasets.builders import load_dataset
39
+ from open_flamingo.eval.task.reg import evaluate_reg
40
+ from open_flamingo.eval.task.gqa import GQADataset
41
+ from open_flamingo.eval.task.vl_checklist import evaluate_vlc
42
+ from open_flamingo.eval.task.crepe import evaluate_crepe
+ import logging  # used by OKVQAPostProcess.lemmatizer when spaCy is missing
43
+
44
+ def get_iou(box1, box2):
45
+ # box1 and box2 should be in the format [x1, y1, x2, y2]
46
+ intersection = max(0, min(box1[2], box2[2]) - max(box1[0], box2[0])) * \
47
+ max(0, min(box1[3], box2[3]) - max(box1[1], box2[1]))
48
+ area_box1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
49
+ area_box2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
50
+ union = area_box1 + area_box2 - intersection
51
+ iou = intersection / union if union > 0 else 0
52
+ return iou
53
+
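# Illustrative sanity check for get_iou (a sketch; boxes are [x1, y1, x2, y2] in pixels):
def _iou_sanity_check():
    # two 100x100 boxes offset by 50 px: intersection 2500, union 17500 -> IoU ~= 0.143
    assert abs(get_iou([0, 0, 100, 100], [50, 50, 150, 150]) - 2500 / 17500) < 1e-6
    # disjoint boxes have zero IoU
    assert get_iou([0, 0, 10, 10], [20, 20, 30, 30]) == 0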
54
+ def expand2square(pil_img, background_color):
55
+ width, height = pil_img.size
56
+ if width == height:
57
+ return pil_img
58
+ elif width > height:
59
+ result = Image.new(pil_img.mode, (width, width), background_color)
60
+ result.paste(pil_img, (0, (width - height) // 2))
61
+ return result
62
+ else:
63
+ result = Image.new(pil_img.mode, (height, height), background_color)
64
+ result.paste(pil_img, ((height - width) // 2, 0))
65
+ return result
66
+
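# Illustrative use of expand2square (a sketch): pad a non-square PIL image to a square
# canvas before resizing; the sizes and fill color here are arbitrary placeholders.
def _expand2square_example():
    from PIL import Image
    img = Image.new("RGB", (200, 100), (255, 255, 255))   # wide dummy image
    squared = expand2square(img, (0, 0, 0))                # pad top/bottom with black
    assert squared.size == (200, 200)
    return squared.resize((224, 224))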
67
+ parser = argparse.ArgumentParser()
68
+ parser.add_argument("--lm_path", type=str, default="facebook/opt-1.3b")
69
+ parser.add_argument("--lm_tokenizer_path", type=str, default="facebook/opt-30b")
70
+ parser.add_argument("--vision_encoder_path", default="ViT-L-14", type=str)
71
+ parser.add_argument("--vision_encoder_pretrained", default="openai", type=str)
72
+ parser.add_argument("--checkpoint_path", type=str, required=True)
73
+ parser.add_argument(
74
+ "--results_file", type=str, default=None, help="JSON file to save results"
75
+ )
76
+
77
+ # Trial arguments
78
+ parser.add_argument("--shots", nargs="+", default=[0, 4, 8, 16, 32], type=int)
79
+ parser.add_argument(
80
+ "--num_trials",
81
+ type=int,
82
+ default=1,
83
+ help="Number of trials to run for each shot using different demonstrations",
84
+ )
85
+ parser.add_argument(
86
+ "--trial_seeds",
87
+ nargs="+",
88
+ default=[0],
89
+ help="Seeds to use for each trial for picking demonstrations and eval sets",
90
+ )
91
+ parser.add_argument(
92
+ "--num_samples", type=int, default=5000, help="Number of samples to evaluate on"
93
+ )
94
+
95
+ parser.add_argument("--batch_size", type=int, default=8)
96
+
97
+ # Per-dataset evaluation flags
98
+ parser.add_argument(
99
+ "--eval_coco",
100
+ action="store_true",
101
+ default=False,
102
+ help="Whether to evaluate on COCO.",
103
+ )
104
+ parser.add_argument(
105
+ "--eval_vqav2",
106
+ action="store_true",
107
+ default=False,
108
+ help="Whether to evaluate on VQAV2.",
109
+ )
110
+ parser.add_argument(
111
+ "--eval_ok_vqa",
112
+ action="store_true",
113
+ default=False,
114
+ help="Whether to evaluate on OK-VQA.",
115
+ )
116
+ parser.add_argument(
117
+ "--eval_imagenet",
118
+ action="store_true",
119
+ default=False,
120
+ help="Whether to evaluate on ImageNet.",
121
+ )
122
+
123
+ parser.add_argument(
124
+ "--eval_flickr30",
125
+ action="store_true",
126
+ default=False,
127
+ help="Whether to evaluate on Flickr30.",
128
+ )
129
+
130
+ parser.add_argument(
131
+ "--eval_refcoco",
132
+ action="store_true",
133
+ default=False,
134
+ help="Whether to evaluate on RefCOCO.",
135
+ )
136
+
137
+ # Dataset arguments
138
+
139
+ ## Flickr30 Dataset
140
+ parser.add_argument(
141
+ "--flickr_image_dir_path",
142
+ type=str,
143
+ help="Path to the flickr30/flickr30k_images directory.",
144
+ default=None,
145
+ )
146
+ parser.add_argument(
147
+ "--flickr_annotations_json_path",
148
+ type=str,
149
+ help="Path to the dataset_flickr30k_coco_style.json file.",
150
+ default=None,
151
+ )
152
+
153
+ ## COCO Dataset
154
+ parser.add_argument(
155
+ "--coco_image_dir_path",
156
+ type=str,
157
+ help="Path to the flickr30/flickr30k_images directory.",
158
+ default=None,
159
+ )
160
+ parser.add_argument(
161
+ "--coco_annotations_json_path",
162
+ type=str,
163
+ default=None,
164
+ )
165
+
166
+ ## VQAV2 Dataset
167
+ parser.add_argument(
168
+ "--vqav2_image_dir_path",
169
+ type=str,
170
+ default=None,
171
+ )
172
+ parser.add_argument(
173
+ "--vqav2_questions_json_path",
174
+ type=str,
175
+ default=None,
176
+ )
177
+ parser.add_argument(
178
+ "--vqav2_annotations_json_path",
179
+ type=str,
180
+ default=None,
181
+ )
182
+
183
+ ## OK-VQA Dataset
184
+ parser.add_argument(
185
+ "--ok_vqa_image_dir_path",
186
+ type=str,
187
+ help="Path to the vqav2/train2014 directory.",
188
+ default=None,
189
+ )
190
+ parser.add_argument(
191
+ "--ok_vqa_questions_json_path",
192
+ type=str,
193
+ help="Path to the v2_OpenEnded_mscoco_train2014_questions.json file.",
194
+ default=None,
195
+ )
196
+ parser.add_argument(
197
+ "--ok_vqa_annotations_json_path",
198
+ type=str,
199
+ help="Path to the v2_mscoco_train2014_annotations.json file.",
200
+ default=None,
201
+ )
202
+
203
+ ## Imagenet dataset
204
+ parser.add_argument("--imagenet_root", type=str, default="/tmp")
205
+
206
+ ## RefCOCO dataset
207
+ parser.add_argument("--refcoco_tsvfile", type=str, default=None)
208
+
209
+ parser.add_argument(
210
+ "--location_token_num",
211
+ default=1000,
212
+ type=int,
213
+ )
214
+ # distributed training
215
+ parser.add_argument(
216
+ "--dist-url",
217
+ default="env://",
218
+ type=str,
219
+ help="url used to set up distributed training",
220
+ )
221
+ parser.add_argument(
222
+ "--dist-backend", default="nccl", type=str, help="distributed backend"
223
+ )
224
+ parser.add_argument(
225
+ "--horovod",
226
+ default=False,
227
+ action="store_true",
228
+ help="Use horovod for distributed training.",
229
+ )
230
+ parser.add_argument(
231
+ "--no-set-device-rank",
232
+ default=False,
233
+ action="store_true",
234
+ help="Don't set device index from local rank (when CUDA_VISIBLE_DEVICES restricted to one per proc).",
235
+ )
236
+ parser.add_argument(
237
+ "--dist",
238
+ default=False,
239
+ action="store_true",
240
+ )
241
+ parser.add_argument(
242
+ "--lora",
243
+ default=False,
244
+ action="store_true",
245
+ )
246
+ parser.add_argument(
247
+ "--lora_r",
248
+ default=16,
249
+ type=int,
250
+ required=False,
251
+ )
252
+ parser.add_argument(
253
+ "--legacy",
254
+ default=False,
255
+ action="store_true",
256
+ )
257
+ parser.add_argument(
258
+ "--special",
259
+ default=False,
260
+ action="store_true",
261
+ )
262
+ parser.add_argument(
263
+ "--id",
264
+ default=0,
265
+ type=int,
266
+ required=False,
267
+ )
268
+
269
+ parser.add_argument(
270
+ "--eval_gqa",
271
+ default=False,
272
+ action="store_true",
273
+ )
274
+ parser.add_argument(
275
+ "--use_sam",
276
+ default=None,
277
+ type=str,
278
+ required=False,
279
+ )
280
+ parser.add_argument(
281
+ "--add_visual_token",
282
+ default=False,
283
+ action="store_true",
284
+ )
285
+ parser.add_argument(
286
+ "--use_format_v2",
287
+ default=False,
288
+ action="store_true",
289
+ )
290
+ parser.add_argument(
291
+ "--eval_aro",
292
+ default=False,
293
+ action="store_true",
294
+ )
295
+ parser.add_argument(
296
+ "--eval_pisc",
297
+ default=False,
298
+ action="store_true",
299
+ )
300
+ parser.add_argument(
301
+ "--eval_reg",
302
+ default=False,
303
+ action="store_true",
304
+ )
305
+ parser.add_argument(
306
+ "--eval_vlc",
307
+ default=False,
308
+ action="store_true",
309
+ )
310
+ parser.add_argument(
311
+ "--eval_crepe",
312
+ default=False,
313
+ action="store_true",
314
+ )
315
+ parser.add_argument(
316
+ "--level",
317
+ default=4,
318
+ type=int,
319
+ )
320
+ parser.add_argument(
321
+ "--type",
322
+ default="swap",
323
+ type=str,
324
+ )
325
+
326
+
327
+ class OKVQAPostProcess():
328
+ def __init__(self):
329
+ self._lemmatizer = None
330
+
331
+ def _lemmatize(self, answers):
332
+ def apply(answer):
333
+ doc = self.lemmatizer(answer)
334
+
335
+ words = []
336
+ for token in doc:
337
+ if token.pos_ in ["NOUN", "VERB"]:
338
+ words.append(token.lemma_)
339
+ else:
340
+ words.append(token.text)
341
+ answer = " ".join(words)
342
+
343
+ return answer
344
+
345
+ return [apply(answer) for answer in answers]
346
+
347
+ @property
348
+ def lemmatizer(self):
349
+ if self._lemmatizer is None:
350
+ try:
351
+ import spacy
352
+
353
+ self._lemmatizer = spacy.load("en_core_web_sm")
354
+ except ImportError:
355
+ logging.error(
356
+ """
357
+ Please install spacy and en_core_web_sm model to apply lemmatization.
358
+ python -m spacy download en_core_web_sm
359
+ OR
360
+ import spacy.cli
361
+ spacy.cli.download("en_core_web_sm")
362
+ """
363
+ )
364
+ exit(1)
365
+
366
+ return self._lemmatizer
367
+
368
+
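# Sketch of how OKVQAPostProcess is used below (requires spaCy with en_core_web_sm;
# the answer strings here are made-up examples, not dataset outputs):
def _okvqa_lemmatize_example():
    postprocessor = OKVQAPostProcess()
    # nouns/verbs are lemmatized, other tokens kept verbatim,
    # e.g. "riding horses" is expected to become "ride horse"
    return postprocessor._lemmatize(["riding horses", "two dogs"])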
369
+ def main():
370
+ args = parser.parse_args()
371
+ if args.dist:
372
+ args.local_rank, args.rank, args.world_size = world_info_from_env()
373
+ print(f"local_rank: {args.local_rank} rank: {args.rank} world_size: {args.world_size}")
374
+ device_id = init_distributed_device(args)
375
+ else:
376
+ args.rank = 0
377
+ args.world_size = 1
+ args.device = 0  # the evaluate_* helpers below read args.device; init_distributed_device sets it only in the dist branch
378
+ print(f"rank: {args.rank} world_size: {args.world_size}")
379
+
380
+ if "sam" in args.checkpoint_path:
381
+ args.use_sam = "vit_l"
382
+
383
+ args.add_visual_token = True
384
+ if "lora" in args.checkpoint_path:
385
+ args.lora = True
386
+
387
+
388
+ args.add_pe = False
389
+ args.add_box = True
390
+ args.relation = False
391
+ args.enhance_data = False
392
+ args.use_format_v2 = True
393
+
394
+
395
+
396
+ import hashlib
397
+ args.id = hashlib.sha224(args.checkpoint_path.encode()).hexdigest()
398
+
399
+ # load model
400
+ flamingo, image_processor, tokenizer, vis_embed_size = create_model_and_transforms(
401
+ args.vision_encoder_path,
402
+ args.vision_encoder_pretrained,
403
+ args.lm_path,
404
+ args.lm_tokenizer_path,
405
+ location_token_num=args.location_token_num,
406
+ lora=args.lora,
407
+ lora_r=16,
408
+ use_sam=args.use_sam,
409
+ add_visual_token=args.add_visual_token,
410
+ use_format_v2=args.use_format_v2,
411
+ add_box=args.add_box,
412
+ add_pe=args.add_pe,
413
+ add_relation=args.relation,
414
+ enhance_data=args.enhance_data,
415
+ )
416
+ flamingo.use_format_v2 = args.use_format_v2
417
+ if args.special:
418
+ flamingo.special = True
419
+ else:
420
+ flamingo.special = False
421
+ if args.legacy:
422
+ flamingo.legacy = True
423
+ print("use legacy evaluation")
424
+ flamingo.step_num = int(args.checkpoint_path.split("/")[-1].split(".")[0].split("_")[-1])
425
+ flamingo.expr_name = args.checkpoint_path.split("/")[-2]
426
+ if args.rank == 0:
427
+ print("legacy", True if hasattr(flamingo, "legacy") else False)
428
+ print("step:", flamingo.step_num)
429
+ print("expr:", flamingo.expr_name)
430
+ print("use format v2:", flamingo.use_format_v2)
431
+ print(args)
432
+ checkpoint = torch.load(args.checkpoint_path, map_location="cpu")
433
+ model_state_dict = {}
434
+ for key in checkpoint["model_state_dict"].keys():
435
+ model_state_dict[key.replace("module.", "")] = checkpoint["model_state_dict"][key]
436
+ if "vision_encoder.logit_scale" in model_state_dict:
437
+ # previous checkpoint has some unnecessary weights
438
+ del model_state_dict["vision_encoder.logit_scale"]
439
+ del model_state_dict["vision_encoder.visual.proj"]
440
+ del model_state_dict["vision_encoder.visual.ln_post.weight"]
441
+ del model_state_dict["vision_encoder.visual.ln_post.bias"]
442
+ flamingo.load_state_dict(model_state_dict, strict=True)
443
+ results = defaultdict(list)
444
+ if args.eval_coco:
445
+ print("Evaluating on COCO...")
446
+ for shot in args.shots:
447
+ scores = []
448
+ for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
449
+ cider_score = evaluate_coco_flickr(
450
+ model=flamingo,
451
+ tokenizer=tokenizer,
452
+ image_processor=image_processor,
453
+ batch_size=args.batch_size,
454
+ image_dir_path=args.coco_image_dir_path,
455
+ annotations_json_path=args.coco_annotations_json_path,
456
+ device=args.device,
457
+ seed=seed,
458
+ vis_embed_size=vis_embed_size,
459
+ rank=args.rank,
460
+ world_size=args.world_size,
461
+ id=args.id,
462
+ )
463
+ print(f"Shots {shot} Trial {trial} CIDEr score: {cider_score}")
464
+ scores.append(cider_score)
465
+ print(f"Shots {shot} Mean CIDEr score: {np.mean(scores)}")
466
+ results["coco"].append(
467
+ {"shots": shot, "trials": scores, "mean": np.mean(scores)}
468
+ )
469
+
470
+ if args.eval_ok_vqa:
471
+ print("Evaluating on OK-VQA...")
472
+ for shot in args.shots:
473
+ scores = []
474
+ for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
475
+ ok_vqa_score = evaluate_vqa(
476
+ model=flamingo,
477
+ tokenizer=tokenizer,
478
+ image_processor=image_processor,
479
+ batch_size=args.batch_size,
480
+ image_dir_path=args.ok_vqa_image_dir_path,
481
+ questions_json_path=args.ok_vqa_questions_json_path,
482
+ annotations_json_path=args.ok_vqa_annotations_json_path,
483
+ vqa_dataset="ok_vqa",
484
+ vis_embed_size=vis_embed_size,
485
+ rank=args.rank,
486
+ world_size=args.world_size,
487
+ id=args.id,
488
+ )
489
+ results["ok_vqa"].append(
490
+ {"shots": shot, "score": ok_vqa_score}
491
+ )
492
+
493
+ if args.eval_vqav2:
494
+ print("Evaluating on VQAv2...")
495
+ for shot in args.shots:
496
+ scores = []
497
+ for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
498
+ vqa_score = evaluate_vqa(
499
+ model=flamingo,
500
+ tokenizer=tokenizer,
501
+ image_processor=image_processor,
502
+ batch_size=args.batch_size,
503
+ image_dir_path=args.vqav2_image_dir_path,
504
+ questions_json_path=args.vqav2_questions_json_path,
505
+ annotations_json_path=args.vqav2_annotations_json_path,
506
+ vqa_dataset="vqa",
507
+ vis_embed_size=vis_embed_size,
508
+ rank=args.rank,
509
+ world_size=args.world_size,
510
+ id=args.id,
511
+ )
512
+ results["vqav2"].append(
513
+ {"shots": shot, "score": vqa_score}
514
+ )
515
+
516
+ if args.eval_gqa:
517
+ print("Evaluating on GQA...")
518
+ for shot in args.shots:
519
+ scores = []
520
+ for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
521
+ vqa_score = evaluate_vqa(
522
+ model=flamingo,
523
+ tokenizer=tokenizer,
524
+ image_processor=image_processor,
525
+ batch_size=args.batch_size,
526
+ vqa_dataset="gqa",
527
+ vis_embed_size=vis_embed_size,
528
+ rank=args.rank,
529
+ world_size=args.world_size,
530
+ id=args.id,
531
+ )
532
+ results["gqa"].append(
533
+ {"shots": shot, "score": vqa_score}
534
+ )
535
+
536
+ if args.eval_refcoco:
537
+ print("Evaluating on RefCOCO...")
538
+ refcoco_score = evaluate_refcoco(
539
+ model=flamingo,
540
+ tokenizer=tokenizer,
541
+ image_processor=image_processor,
542
+ batch_size=args.batch_size,
543
+ device=args.device,
544
+ tsvfile=args.refcoco_tsvfile,
545
+ vis_embed_size=vis_embed_size,
546
+ rank=args.rank,
547
+ world_size=args.world_size,
548
+ id=args.id,
549
+ )
550
+ results["refcoco"].append(
551
+ {"score": refcoco_score}
552
+ )
553
+ if args.eval_aro:
554
+ print("Evaluating on ARO...")
555
+ aro_score = evaluate_aro(
556
+ model=flamingo,
557
+ tokenizer=tokenizer,
558
+ image_processor=image_processor,
559
+ batch_size=args.batch_size,
560
+ device=args.device,
561
+ tsvfile=args.refcoco_tsvfile,
562
+ vis_embed_size=vis_embed_size,
563
+ rank=args.rank,
564
+ world_size=args.world_size,
565
+ id=args.id,
566
+ add_relation=args.relation,
567
+ )
568
+ results["aro"].append(
569
+ {"score": aro_score}
570
+ )
571
+ if args.eval_pisc:
572
+ print("Evaluating on PISC...")
573
+ pisc_score = evaluate_pisc(
574
+ model=flamingo,
575
+ tokenizer=tokenizer,
576
+ image_processor=image_processor,
577
+ batch_size=args.batch_size,
578
+ device=args.device,
579
+ tsvfile=args.refcoco_tsvfile,
580
+ vis_embed_size=vis_embed_size,
581
+ rank=args.rank,
582
+ world_size=args.world_size,
583
+ id=args.id,
584
+ )
585
+ results["pisc"].append(
586
+ {"score": pisc_score}
587
+ )
588
+ if args.eval_reg:
589
+ print("Evaluating on Referring Expression Generation...")
590
+ cider = evaluate_reg(
591
+ model=flamingo,
592
+ tokenizer=tokenizer,
593
+ image_processor=image_processor,
594
+ vis_embed_size=vis_embed_size,
595
+ rank=args.rank,
596
+ world_size=args.world_size,
597
+ id=args.id,
598
+ )
599
+ results["reg"].append(
600
+ {"score": cider}
601
+ )
602
+ if args.eval_vlc:
603
+ print("Evaluating on VL-checklist...")
604
+ vlc_score = evaluate_vlc(
605
+ model=flamingo,
606
+ tokenizer=tokenizer,
607
+ image_processor=image_processor,
608
+ vis_embed_size=vis_embed_size,
609
+ rank=args.rank,
610
+ world_size=args.world_size,
611
+ id=args.id,
612
+ )
613
+ results["vlc"].append(
614
+ {"score": vlc_score}
615
+ )
616
+ if args.eval_crepe:
617
+ print("Evaluating on CREPE...")
618
+ crepe_score = evaluate_crepe(
619
+ model=flamingo,
620
+ tokenizer=tokenizer,
621
+ image_processor=image_processor,
622
+ vis_embed_size=vis_embed_size,
623
+ rank=args.rank,
624
+ world_size=args.world_size,
625
+ id=args.id,
626
+ level=args.level,
627
+ type=args.type,
628
+ )
629
+ results["crepe"].append(
630
+ {"score": crepe_score}
631
+ )
632
+
633
+ def prepare_batch_images(batch, image_processor):
634
+ batch_images = None
635
+ for b in batch:
636
+ b_image = image_processor(b["image"]).unsqueeze(0).unsqueeze(1).unsqueeze(0)
637
+ if batch_images is None:
638
+ batch_images = b_image
639
+ else:
640
+ batch_images = torch.cat([batch_images, b_image], dim=0)
641
+ return batch_images
642
+
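# For reference, prepare_batch_images stacks one frame per sample into the
# (batch, num_media, num_frames, C, H, W) layout the model expects. A minimal sketch
# with dummy PIL images, assuming the CLIP ViT-L/14 224x224 preprocessing used in this file:
def _prepare_batch_images_example(image_processor):
    from PIL import Image
    batch = [{"image": Image.new("RGB", (224, 224))} for _ in range(4)]
    batch_images = prepare_batch_images(batch=batch, image_processor=image_processor)
    assert batch_images.shape == (4, 1, 1, 3, 224, 224)
    return batch_images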
643
+ def get_outputs(
644
+ model,
645
+ batch_images,
646
+ attention_mask,
647
+ max_generation_length,
648
+ min_generation_length,
649
+ num_beams,
650
+ length_penalty,
651
+ input_ids,
652
+ image_start_index_list=None,
653
+ image_nums=None,
654
+ bad_words_ids=None,
655
+ ):
656
+ with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
657
+ outputs = model.generate(
658
+ batch_images,
659
+ input_ids,
660
+ attention_mask=attention_mask,
661
+ max_new_tokens=max_generation_length,
662
+ min_length=min_generation_length,
663
+ num_beams=num_beams,
664
+ length_penalty=length_penalty,
665
+ image_start_index_list=image_start_index_list,
666
+ image_nums=image_nums,
667
+ bad_words_ids=bad_words_ids,
668
+ )
669
+
670
+ outputs = outputs[:, len(input_ids[0]) :]
671
+ return outputs
672
+
673
+
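# Sketch of calling get_outputs for a single prompt: it wraps model.generate under fp16
# autocast and strips the prompt tokens from the returned sequence. The prompt string and
# generation settings below are illustrative and mirror the ones built elsewhere in this file.
def _get_outputs_example(model, tokenizer, batch_images, vis_embed_size, media_token_id):
    prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token * vis_embed_size}<|#endofimage#|>"]
    encodings = tokenizer(prompt, return_tensors="pt")
    input_ids = encodings["input_ids"].cuda()
    attention_mask = encodings["attention_mask"].cuda()
    image_start_index_list = [[(input_ids[0] == media_token_id).nonzero()[-1].item() + 1]]
    outputs = get_outputs(
        model=model,
        batch_images=batch_images,
        attention_mask=attention_mask,
        max_generation_length=20,
        min_generation_length=1,
        num_beams=3,
        length_penalty=0,
        input_ids=input_ids,
        image_start_index_list=image_start_index_list,
        image_nums=[1],
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)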
674
+ def evaluate_coco_flickr(
675
+ model,
676
+ tokenizer,
677
+ image_processor,
678
+ batch_size,
679
+ image_dir_path,
680
+ annotations_json_path,
681
+ seed=42,
682
+ max_generation_length=20,
683
+ num_beams=1,
684
+ length_penalty=-2.0,
685
+ device=-1,
686
+ is_flickr=False,
687
+ vis_embed_size=None,
688
+ rank=0,
689
+ world_size=1,
690
+ id=0,
691
+ ):
692
+ """Evaluate a model on COCO dataset.
693
+
694
+ Args:
695
+ model (nn.Module): model to evaluate
696
+ tokenizer (transformers.PreTrainedTokenizer): tokenizer for the model
697
+ image_processor : image processor for the model
698
+ batch_size (int): batch size
699
+ image_dir_path (str, optional): path to the directory containing the images.
700
+ annotations_json_path (str, optional): path to the json file containing the annotations.
701
+ seed (int, optional): seed for random number generator. Defaults to 42.
702
+ max_generation_length (int, optional): maximum length of the generated caption. Defaults to 20.
703
+ num_beams (int, optional): number of beams to use for beam search. Defaults to 1.
704
+ length_penalty (float, optional): length penalty for beam search. Defaults to -2.0.
705
+ num_samples (int, optional): number of samples to evaluate on. Defaults to 5000.
706
+ query_set_size (int, optional): number of samples to use for query set. Defaults to 2048.
707
+ num_shots (int, optional): number of in-context samples to use. Defaults to 8.
708
+ device (int, optional): device to use. Defaults to -1.
709
+ num_workers (int, optional): number of workers to use for dataloader. Defaults to 4.
710
+ is_flickr (bool): defines if that data is COCO or Flickr. Defaults to False (COCO).
711
+
712
+ Returns:
713
+ float: CIDEr score
714
+
715
+ """
716
+ # eval_dataset = COCOFlickrDataset(
717
+ # image_dir_path=image_dir_path,
718
+ # annotations_path=annotations_json_path,
719
+ # is_flickr=is_flickr,
720
+ # )
721
+ coco_dataset = load_dataset("coco_caption")
722
+ eval_dataset = coco_dataset["test"]
723
+
724
+
725
+ model.eval().cuda()
726
+ predictions = defaultdict()
727
+ lang_encoder_name = model.lang_encoder.__class__.__name__.lower()
728
+ # if "peft" in lang_encoder_name:
729
+ # lang_encoder_name = model.lang_encoder.base_model.model.__class__.__name__.lower()
730
+ try:
731
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
732
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
733
+ pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
734
+ bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
735
+ except:
736
+ pass
737
+
738
+ def get_prompt(sample):
739
+ return f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>"
740
+
741
+ tokenizer.padding_side = "left"
742
+ cnt = 0
743
+ if world_size > 1:
744
+ torch.distributed.barrier()
745
+ desc = "Running inference Flickr30" if is_flickr else "Running inference COCO"
746
+ for ii, batch in enumerate(more_itertools.chunked(
747
+ tqdm(eval_dataset, desc=desc, disable=(rank != 0)), batch_size
748
+ )):
749
+ if ii % world_size != rank:
750
+ continue
751
+ cnt += len(batch)
752
+ batch_images = prepare_batch_images(
753
+ batch=batch,
754
+ image_processor=image_processor,
755
+ ).cuda()
756
+ batch_text = [get_prompt(s) for s in batch]
757
+ encodings = tokenizer(
758
+ batch_text,
759
+ padding="longest",
760
+ truncation=True,
761
+ return_tensors="pt",
762
+ max_length=2000,
763
+ )
764
+ input_ids = encodings["input_ids"].cuda()
765
+ attention_mask = encodings["attention_mask"].cuda()
766
+ skip_special_tokens = False
767
+ if hasattr(model, "legacy") and model.legacy and "opt" in lang_encoder_name:
768
+ if rank == 0:
769
+ tqdm.write("use legacy model")
770
+ skip_special_tokens = True
771
+ for i in range(len(input_ids)):
772
+ media_token_index = (input_ids[i] == media_token_id).nonzero()[0,0]
773
+ endofmedia_token_index = (input_ids[i] == endofmedia_token_id).nonzero()[0,0]
774
+ input_ids[i, media_token_index - 1] = media_token_id
775
+ input_ids[i, media_token_index] = pad_token_id
776
+ input_ids[i, endofmedia_token_index - 1] = endofmedia_token_id
777
+ input_ids[i, endofmedia_token_index] = bos_token_id
778
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
779
+ image_start_index_list = [[x] for x in image_start_index_list]
780
+ image_nums = [1] * len(input_ids)
781
+ if "llama" in lang_encoder_name:
782
+ attention_mask[input_ids == 0] = 0
783
+ outputs = get_outputs(
784
+ model=model,
785
+ batch_images=batch_images,
786
+ attention_mask=attention_mask,
787
+ max_generation_length=30,
788
+ min_generation_length=8,
789
+ num_beams=5,
790
+ length_penalty=0,
791
+ input_ids=input_ids,
792
+ image_start_index_list=image_start_index_list,
793
+ image_nums=image_nums,
794
+ )
795
+ new_predictions = [
796
+ postprocess_captioning_generation(out).replace('"', "")
797
+ for out in tokenizer.batch_decode(outputs, skip_special_tokens=True)
798
+ ]
799
+ # if rank == 0:
800
+ # tqdm.write(f"{batch_images.shape} {batch[0]} pred: {new_predictions[0]}")
801
+
802
+ for i, sample in enumerate(batch):
803
+ predictions[int(sample["image_id"])] = {
804
+ "caption": new_predictions[i],
805
+ }
806
+ results_path = (
807
+ f"flickrresults_{lang_encoder_name}_{rank}_{id}.json"
808
+ if is_flickr
809
+ else f"cocoresults_{lang_encoder_name}_{rank}_{id}.json"
810
+ )
811
+ with open(results_path, "w") as f:
812
+ f.write(
813
+ json.dumps(
814
+ [
815
+ {"image_id": k, "caption": predictions[k]["caption"]}
816
+ for k in predictions
817
+ ],
818
+ indent=2,
819
+ )
820
+ )
821
+ print("save to", results_path)
822
+ del predictions
823
+ time.sleep(10)
824
+ if world_size > 1:
825
+ torch.distributed.barrier()
826
+ if rank == 0:
827
+ print(f"evaluate on rank {rank}. world size is {world_size}")
828
+ predictions = []
829
+ for rank_i in range(world_size):
830
+ part_results_path = (
831
+ f"flickrresults_{lang_encoder_name}_{rank_i}_{id}.json"
832
+ if is_flickr
833
+ else f"cocoresults_{lang_encoder_name}_{rank_i}_{id}.json"
834
+ )
835
+ print("load", part_results_path)
836
+ predictions.extend(json.load(open(part_results_path)))
837
+ os.remove(part_results_path)
838
+ print("num:", len(predictions))
839
+ results_path = (
840
+ f"flickrresults_{lang_encoder_name}.json"
841
+ if is_flickr
842
+ else f"cocoresults_{lang_encoder_name}.json"
843
+ )
844
+ json.dump(predictions, open(results_path, "w"), indent=2)
845
+
846
+ metrics = compute_cider(
847
+ result_path=results_path,
848
+ annotations_path="/gpfs/u/home/LMCG/LMCGljnn/scratch/.cache/lavis/coco_gt/coco_karpathy_test_gt.json",
849
+ )
850
+ os.makedirs("eval_results", exist_ok=True)
851
+ acc = metrics["CIDEr"]
852
+ with open(os.path.join("eval_results", f"cococap_{model.expr_name}_{model.step_num}_{int(time.time())}_{acc}"), "w") as f:
853
+ f.write(json.dumps(predictions, indent=2))
854
+
855
+ # delete the temporary file
856
+ os.remove(results_path)
857
+ else:
858
+ metrics = {}
859
+ metrics["CIDEr"] = 0.0
860
+
861
+ return metrics["CIDEr"]
862
+
863
+
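# The merged predictions file handed to compute_cider above is just a list of
# {"image_id", "caption"} records. A minimal sketch of producing one (the output path
# is a placeholder):
def _write_caption_results_sketch(predictions, path="cocoresults_example.json"):
    # predictions: {image_id -> {"caption": str}} as built in evaluate_coco_flickr
    records = [{"image_id": k, "caption": v["caption"]} for k, v in predictions.items()]
    with open(path, "w") as f:
        json.dump(records, f, indent=2)
    return path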
864
+ def evaluate_vqa(
865
+ model,
866
+ tokenizer,
867
+ image_processor,
868
+ batch_size,
869
+ image_dir_path=None,
870
+ questions_json_path=None,
871
+ annotations_json_path=None,
872
+ vqa_dataset="vqa",
873
+ vis_embed_size=None,
874
+ rank=0,
875
+ world_size=1,
876
+ id=0,
877
+ ):
878
+ """
879
+ Evaluate a model on VQA datasets. Currently supports VQA v2.0.
880
+
881
+ Args:
882
+ model (nn.Module): model to evaluate
883
+ tokenizer (transformers.PreTrainedTokenizer): tokenizer for the model
884
+ image_processor : image processor for the model
885
+ batch_size (int): batch size
886
+ image_dir_path (str): path to image directory
887
+ questions_json_path (str): path to questions json file
888
+ annotations_json_path (str): path to annotations json file
889
+ seed (int, optional): random seed. Defaults to 42.
890
+ max_generation_length (int, optional): max generation length. Defaults to 5.
891
+ num_beams (int, optional): number of beams to use for beam search. Defaults to 3.
892
+ length_penalty (float, optional): length penalty for beam search. Defaults to -2.0.
893
+ num_samples (int, optional): number of samples to evaluate on. Defaults to 5000 samples.
894
+ query_set_size (int, optional): size of the query set. Defaults to 2048.
895
+ num_shots (int, optional): number of shots to use. Defaults to 8.
896
+ device (int, optional): device to use. Defaults to -1 (cpu).
897
+ num_workers (int, optional): number of workers to use. Defaults to 4.
898
+ vqa_dataset (string): type of vqa dataset: currently supports vqa, ok_vqa. Defaults to vqa.
899
+ Returns:
900
+ float: accuracy score
901
+ """
902
+ if world_size > 1:
903
+ torch.distributed.barrier()
904
+ if vqa_dataset == "gqa":
905
+ eval_dataset = GQADataset()
906
+ else:
907
+ eval_dataset = VQADataset(
908
+ image_dir_path=image_dir_path,
909
+ question_path=questions_json_path,
910
+ annotations_path=annotations_json_path,
911
+ vqa_dataset=vqa_dataset,
912
+ )
913
+ postprocessor = OKVQAPostProcess()
914
+ try:
915
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
916
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
917
+ pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
918
+ bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
919
+ except:
920
+ pass
921
+ def get_prompt(sample):
922
+ return f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>Question: {sample['question'].strip()} Short answer:"
923
+ # return f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>"
924
+
925
+ model.eval().cuda()
926
+ lang_encoder_name = model.lang_encoder.__class__.__name__.lower()
927
+ if "peft" in lang_encoder_name:
928
+ lang_encoder_name = model.lang_encoder.base_model.model.__class__.__name__.lower()
929
+ predictions = []
930
+ tokenizer.padding_side = "left"
931
+ if world_size > 1:
932
+ torch.distributed.barrier()
933
+ this_tot = 0
934
+ for ii, batch in enumerate(more_itertools.chunked(
935
+ tqdm(eval_dataset, desc="Running inference", disable=(rank != 0)), batch_size
936
+ )):
937
+ if ii % world_size != rank:
938
+ continue
939
+ batch_images = prepare_batch_images(
940
+ batch=batch,
941
+ image_processor=image_processor,
942
+ ).cuda()
943
+ batch_text = [get_prompt(s) for s in batch]
944
+ encodings = tokenizer(
945
+ batch_text,
946
+ return_tensors="pt",
947
+ padding="longest",
948
+ truncation=True,
949
+ max_length=2000,
950
+ )
951
+ input_ids = encodings["input_ids"].cuda()
952
+ attention_mask = encodings["attention_mask"].cuda()
953
+ skip_special_tokens = True
954
+ if hasattr(model, "legacy") and model.legacy and "opt" in lang_encoder_name:
955
+ if rank == 0:
956
+ tqdm.write("use legacy model")
957
+ for i in range(len(input_ids)):
958
+ media_token_index = (input_ids[i] == media_token_id).nonzero()[0,0]
959
+ endofmedia_token_index = (input_ids[i] == endofmedia_token_id).nonzero()[0,0]
960
+ input_ids[i, media_token_index - 1] = media_token_id
961
+ input_ids[i, media_token_index] = pad_token_id
962
+ input_ids[i, endofmedia_token_index - 1] = endofmedia_token_id
963
+ input_ids[i, endofmedia_token_index] = bos_token_id
964
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
965
+ image_start_index_list = [[x] for x in image_start_index_list]
966
+ image_nums = [1] * len(input_ids)
967
+ if "llama" in lang_encoder_name:
968
+ attention_mask[input_ids == 0] = 0
969
+ outputs = get_outputs(
970
+ model=model,
971
+ batch_images=batch_images,
972
+ attention_mask=attention_mask,
973
+ max_generation_length=10,
974
+ min_generation_length=1,
975
+ num_beams=5,
976
+ length_penalty=0,
977
+ input_ids=input_ids,
978
+ image_start_index_list=image_start_index_list,
979
+ image_nums=image_nums,
980
+ )
981
+ # postprocess begin
982
+ new_predictions = [
983
+ out.strip().lower().strip(string.punctuation+" ") for out in tokenizer.batch_decode(outputs, skip_special_tokens=skip_special_tokens)
984
+ ]
985
+ if vqa_dataset == "ok_vqa":
986
+ new_predictions = postprocessor._lemmatize(new_predictions)
987
+ if model.special:
988
+ for i in range(len(new_predictions)):
989
+ for answer, _ in Counter(batch[i]['answers']).most_common():
990
+ if answer in new_predictions[i]:
991
+ new_predictions[i] = answer
992
+ break
993
+ if "cant" in new_predictions[i] and "no" == answer:
994
+ new_predictions[i] = answer
995
+ break
996
+ if "can" in new_predictions[i] and "not" not in new_predictions[i] and "cant" not in new_predictions[i] and "yes" == answer:
997
+ new_predictions[i] = answer
998
+ break
999
+
1000
+ this_tot += 1
1001
+ if rank == 0 and this_tot % 20 == 0:
1002
+ for i in range(1):
1003
+ tqdm.write("model output: " + new_predictions[i])
1004
+
1005
+ predictions.extend(
1006
+ [
1007
+ {"answer": p, "question_id": sample["question_id"], "_question": sample["question"], "answers": sample["answers"]}
1008
+ for p, sample in zip(new_predictions, batch)
1009
+ ]
1010
+ )
1011
+ with open(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank}_{id}.json", "w") as f:
1012
+ f.write(json.dumps(predictions))
1013
+ print("save to", f"{vqa_dataset}_{lang_encoder_name}_results_part{rank}_{id}.json")
1014
+
1015
+ time.sleep(10)
1016
+ if world_size > 1:
1017
+ torch.distributed.barrier()
1018
+ if rank == 0:
1019
+ print(f"evaluate on rank {rank}. world size is {world_size}")
1020
+ predictions = []
1021
+ for rank_i in range(world_size):
1022
+ print("load", f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")
1023
+ predictions.extend(json.load(open(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")))
1024
+ os.remove(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")
1025
+ print("num:", len(predictions))
1026
+ # save the predictions to a temporary file
1027
+ random_uuid = str(uuid.uuid4())
1028
+ with open(f"{vqa_dataset}results_{random_uuid}.json", "w") as f:
1029
+ f.write(json.dumps(predictions, indent=4))
1030
+
1031
+ if vqa_dataset == "gqa":
1032
+ acc = compute_gqa_accuracy(predictions)
1033
+ else:
1034
+ acc = compute_vqa_accuracy(
1035
+ f"{vqa_dataset}results_{random_uuid}.json",
1036
+ questions_json_path,
1037
+ annotations_json_path,
1038
+ vqa_dataset=vqa_dataset,
1039
+ )
1040
+ print(vqa_dataset, "score:", acc, "| save to", f"{vqa_dataset}results_{random_uuid}.json")
1041
+ os.makedirs("eval_results", exist_ok=True)
1042
+ with open(os.path.join("eval_results", f"{vqa_dataset}_{model.expr_name}_{model.step_num}_{int(time.time())}_{acc}"), "w") as f:
1043
+ f.write(json.dumps(predictions, indent=2))
1044
+
1045
+ # delete the temporary file
1046
+ os.remove(f"{vqa_dataset}results_{random_uuid}.json")
1047
+ else:
1048
+ time.sleep(5)
1049
+ acc = 0.0
1050
+ if world_size > 1:
1051
+ torch.distributed.barrier()
1052
+ return acc
1053
+
1054
+
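# For reference, the per-rank VQA records written above only need "answer" and
# "question_id" for scoring. A sketch of the final accuracy call (the temporary path is a
# placeholder; questions/annotations paths are the same ones passed into evaluate_vqa):
def _score_vqa_predictions_sketch(predictions, questions_json_path, annotations_json_path):
    tmp_path = "vqa_predictions_example.json"
    with open(tmp_path, "w") as f:
        json.dump(predictions, f)
    acc = compute_vqa_accuracy(tmp_path, questions_json_path, annotations_json_path, vqa_dataset="vqa")
    os.remove(tmp_path)
    return acc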
1055
+ def evaluate_refcoco(
1056
+ model,
1057
+ tokenizer,
1058
+ image_processor,
1059
+ batch_size,
1060
+ tsvfile,
1061
+ max_generation_length=20,
1062
+ num_beams=3,
1063
+ length_penalty=-2.0,
1064
+ device=-1,
1065
+ vis_embed_size=None,
1066
+ rank=0,
1067
+ world_size=1,
1068
+ id=0,
1069
+ ):
1070
+ model.eval().cuda()
1071
+ loc_token_ids = []
1072
+ for i in range(1000):
1073
+ loc_token_ids.append(int(tokenizer(f"<loc_{i}>", add_special_tokens=False)["input_ids"][-1]))
1074
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
1075
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
1076
+ pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
1077
+ bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
1078
+ prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
1079
+ # all_ids = set(range(model.lang_encoder.lm_head.out_features))
1080
+ # bad_words_ids = list(all_ids - set(loc_token_ids))
1081
+ # bad_words_ids = [[b] for b in bad_words_ids]
1082
+ # min_loc_token_id = min(loc_token_ids)
1083
+ # max_loc_token_id = max(loc_token_ids)
1084
+ total = 0
1085
+ correct = 0
1086
+ ious = []
1087
+ if "refcocog" in tsvfile:
1088
+ dataset_name = "refcocog"
1089
+ elif "refcocoplus" in tsvfile:
1090
+ dataset_name = "refcocoplus"
1091
+ else:
1092
+ dataset_name = "refcoco"
1093
+ with open(tsvfile, "r") as f:
1094
+ lines = f.readlines()
1095
+ pbar = tqdm(lines, disable=(rank != 0))
1096
+ for ii, line in enumerate(pbar):
1097
+ if ii % world_size != rank:
1098
+ continue
1099
+ total += 1
1100
+ line = line.rstrip()
1101
+ uniq_id, image_id, text, region_coord, image = line.split("\t")
1102
+
1103
+ image = Image.open(BytesIO(base64.urlsafe_b64decode(image))).convert("RGB")
1104
+ # image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal2/yolo.png").convert("RGB")
1105
+ # image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal/temp/cat.png").convert("RGB")
1106
+ # image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal/temp/262148000.png")
1107
+
1108
+ gt_box = np.array(list(map(float, region_coord.split(","))))
1109
+ width = image.width
1110
+ height = image.height
1111
+ image = image.resize((224, 224))
1112
+ gt_box = gt_box / np.array([width, height, width, height]) * 224
1113
+ batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
1114
+ prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|><|#object#|>{text.rstrip('.').strip()}<|#endofobject#|><|#visual#|>"]
1115
+ # prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>the cat<|#visual#|>"]
1116
+ # prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>"]
1117
+ # prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>a man<|#visual#|> is doing a trick on a skateboard<|#visual#|>"]
1118
+
1119
+
1120
+ encodings = tokenizer(
1121
+ prompt,
1122
+ padding="longest",
1123
+ truncation=True,
1124
+ return_tensors="pt",
1125
+ max_length=2000,
1126
+ )
1127
+ input_ids = encodings["input_ids"]
1128
+ attention_mask = encodings["attention_mask"]
1129
+ # attention_mask[input_ids == prebox_token_id] = 0
1130
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
1131
+ image_start_index_list = [[x] for x in image_start_index_list]
1132
+ image_nums = [1] * len(input_ids)
1133
+ vision_x = batch_images.cuda()
1134
+ lang_x = input_ids.cuda()
1135
+ attention_mask = attention_mask.cuda()
1136
+
1137
+ model.debug_id = 0
1138
+ with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
1139
+ outputs = model(
1140
+ vision_x=vision_x,
1141
+ lang_x=lang_x,
1142
+ attention_mask=attention_mask,
1143
+ labels=None,
1144
+ image_nums=image_nums,
1145
+ image_start_index_list=image_start_index_list,
1146
+ added_bbox_list=None,
1147
+ add_box=False,
1148
+ )
1149
+ boxes = outputs["boxes"]
1150
+ scores = outputs["scores"]
1151
+ if len(scores) > 0:
1152
+ box = boxes[scores.argmax()]
1153
+ iou = get_iou(box, gt_box)
1154
+ else:
1155
+ iou = 0.0
1156
+ # tqdm.write(f"output: {tokenizer.batch_decode(outputs)}")
1157
+ tqdm.write(f"no output for: {uniq_id}, {image_id}, {text}")
1158
+ if iou >= 0.5:
1159
+ correct += 1
1160
+ pbar.set_description(f"iou: {iou:.2f} score: {correct / total:.4f}")
1161
+ # open_cv_image = np.array(image)
1162
+ # # Convert RGB to BGR
1163
+ # open_cv_image = open_cv_image[:, :, ::-1].copy()
1164
+ # for box, score in zip(boxes, scores):
1165
+ # open_cv_image = cv2.rectangle(open_cv_image, box[:2].astype(int), box[2:].astype(int), (255, 0, 0), 2)
1166
+ # cv2.imwrite("output.jpg", open_cv_image)
1167
+ # print(boxes)
1168
+ # print(scores)
1169
+ # exit()
1170
+
1171
+
1172
+ with open(f"{dataset_name}_results_part{rank}_{id}.json", "w") as f:
1173
+ f.write(json.dumps([total, correct]))
1174
+ if world_size > 1:
1175
+ torch.distributed.barrier()
1176
+ if rank == 0:
1177
+ total = 0
1178
+ correct = 0
1179
+ print(f"evaluate on rank {rank}. world size is {world_size}")
1180
+ for rank_i in range(world_size):
1181
+ [total_part, correct_part] = json.load(open(f"{dataset_name}_results_part{rank_i}_{id}.json"))
1182
+ os.remove(f"{dataset_name}_results_part{rank_i}_{id}.json")
1183
+ total += total_part
1184
+ correct += correct_part
1185
+ score = correct / total
1186
+ print("score:", score)
1187
+ with open(os.path.join("eval_results", f"{dataset_name}_{model.expr_name}_{model.step_num}_{int(time.time())}_{score}"), "w") as f:
1188
+ pass
1189
+ else:
1190
+ score = 0.0
1191
+ if world_size > 1:
1192
+ torch.distributed.barrier()
1193
+ return score
1194
+
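# The RefCOCO scoring above counts a hit when IoU with the ground-truth box is at least 0.5,
# after both boxes are mapped into the 224x224 resized image. A small sketch of that check
# (pred_box is assumed to already be in 224x224 coordinates):
def _refcoco_hit_sketch(pred_box, region_coord, width, height):
    # region_coord: "x1,y1,x2,y2" in original image pixels, as read from the TSV
    gt_box = np.array(list(map(float, region_coord.split(",")))) / np.array([width, height, width, height]) * 224
    return get_iou(pred_box, gt_box) >= 0.5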
1195
+
1196
+
1197
+ # def preprocess_visual_info(Text):
1198
+ # text = Text.split(" ")
1199
+ # for is_idx, t in enumerate(text):
1200
+ # if t == "is":
1201
+ # break
1202
+ # the_idx = is_idx
1203
+ # while text[the_idx] != "the":
1204
+ # the_idx -= 1
1205
+ # obj_A = " ".join(text[the_idx+1:is_idx])
1206
+ # second_the_idx = len(text) - 1
1207
+ # while text[second_the_idx] != "the":
1208
+ # second_the_idx -= 1
1209
+ # obj_B = " ".join(text[second_the_idx+1:])
1210
+ # visual_obj_A = f"<|#object#|>{obj_A}<|#endofobject#|><|#visual#|><|#box#|><|#endofattr#|>"
1211
+ # visual_obj_B = f"<|#object#|>{obj_B}<|#endofobject#|><|#visual#|><|#box#|><|#endofattr#|>"
1212
+ # Text = Text.replace(obj_A, f"<|#object#|>{obj_A}<|#endofobject#|><|#visual#|><|#box#|><|#endofattr#|>")
1213
+ # Text = Text.replace(obj_B, f"<|#object#|>{obj_B}<|#endofobject#|><|#visual#|><|#box#|><|#endofattr#|>")
1214
+ # return Text, obj_A, obj_B, visual_obj_A, visual_obj_B
1215
+
1216
+
1217
+ def preprocess_visual_info(Text):
1218
+ text = Text.split(" ")
1219
+ for is_idx, t in enumerate(text):
1220
+ if t == "is":
1221
+ break
1222
+ the_idx = is_idx
1223
+ while text[the_idx] != "the":
1224
+ the_idx -= 1
1225
+ obj_A = " ".join(text[the_idx+1:is_idx])
1226
+ second_the_idx = len(text) - 1
1227
+ while text[second_the_idx] != "the":
1228
+ second_the_idx -= 1
1229
+ obj_B = " ".join(text[second_the_idx+1:])
1230
+ relation = " ".join(text[is_idx+1:second_the_idx])
1231
+ visual_obj_A = f"<|#object#|>the {obj_A}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|>"
1232
+ visual_obj_B = f"<|#object#|><|#previsual#|><|#prebox#|><|#object#|>the {obj_B}<|#endofobject#|>"
1233
+ Text = f"{visual_obj_A} is {relation} {visual_obj_B}"
1234
+ return Text, obj_A, visual_obj_A, obj_B, visual_obj_B, relation
1235
+
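# Worked example of what preprocess_visual_info produces, assuming the caption follows
# the ARO "the A is R the B" pattern handled above:
def _preprocess_visual_info_example():
    text, obj_A, visual_obj_A, obj_B, visual_obj_B, relation = preprocess_visual_info(
        "the man is sitting on the fire hydrant"
    )
    assert obj_A == "man" and relation == "sitting on" and obj_B == "fire hydrant"
    # visual_obj_A tags the grounded subject; visual_obj_B leaves a <|#prebox#|> slot to be
    # filled with a candidate box for the object, so text becomes:
    # "<|#object#|>the man<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|> is sitting on
    #  <|#object#|><|#previsual#|><|#prebox#|><|#object#|>the fire hydrant<|#endofobject#|>"
    return text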
1236
+
1237
+
1238
+
1239
+ def get_bbox(visual_box_list, batch_images, prompt, model, tokenizer, media_token_id, prebox_token_id, debug=False, return_all=False):
1240
+ assert isinstance(prompt, list) and len(prompt) == 1 and isinstance(prompt[0], str)
1241
+ encodings = tokenizer(
1242
+ prompt,
1243
+ padding="longest",
1244
+ truncation=True,
1245
+ return_tensors="pt",
1246
+ max_length=2000,
1247
+ )
1248
+ input_ids = encodings["input_ids"]
1249
+ attention_mask = encodings["attention_mask"]
1250
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
1251
+ image_start_index_list = [[x] for x in image_start_index_list]
1252
+ image_nums = [1] * len(input_ids)
1253
+ vision_x = batch_images.cuda()
1254
+ lang_x = input_ids.cuda()
1255
+ attention_mask = attention_mask.cuda()
1256
+
1257
+ model.debug_id = 0
1258
+ with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
1259
+ outputs = model(
1260
+ vision_x=vision_x,
1261
+ lang_x=lang_x,
1262
+ attention_mask=attention_mask,
1263
+ labels=None,
1264
+ image_nums=image_nums,
1265
+ image_start_index_list=image_start_index_list,
1266
+ added_bbox_list=visual_box_list,
1267
+ add_box=visual_box_list is not None,
1268
+ relations=None,
1269
+ debug_mode=False,
1270
+ )
1271
+ boxes = outputs["boxes"]
1272
+ scores = outputs["scores"]
1273
+ if debug:
1274
+ import pdb; pdb.set_trace()
1275
+ if return_all:
1276
+ return boxes, scores
1277
+ if len(scores) == 0:
1278
+ return None, None
1279
+ else:
1280
+ return boxes[scores.argmax()], scores.max()
1281
+
1282
+
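# Illustrative call pattern for get_bbox, mirroring how evaluate_aro uses it below:
# first ground the subject with no box prompt, then query candidate boxes for the object
# conditioned on the subject's box. The prompt strings are examples; the /224 divisor
# matches the resized image side used throughout this file.
def _get_bbox_example(model, tokenizer, batch_images, vis_embed_size, media_token_id, prebox_token_id):
    prefix = f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token * vis_embed_size}<|#endofimage#|>"
    # 1) best box for the subject
    prompt = [prefix + "<|#object#|>the man<|#endofobject#|><|#visual#|>"]
    first_box, first_score = get_bbox(None, batch_images, prompt, model, tokenizer,
                                      media_token_id, prebox_token_id)
    if first_box is None:
        return None
    # 2) all candidate boxes for the object, conditioned on the subject's box
    prompt = [prefix + "<|#object#|>the man<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|>"
              " is sitting on<|#object#|><|#previsual#|>"]
    boxes, scores = get_bbox([torch.tensor(first_box).unsqueeze(0).cuda() / 224],
                             batch_images, prompt, model, tokenizer,
                             media_token_id, prebox_token_id, return_all=True)
    return boxes, scores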
1283
+ def evaluate_aro(
1284
+ model,
1285
+ tokenizer,
1286
+ image_processor,
1287
+ batch_size,
1288
+ tsvfile,
1289
+ max_generation_length=20,
1290
+ num_beams=3,
1291
+ length_penalty=-2.0,
1292
+ device=-1,
1293
+ vis_embed_size=None,
1294
+ rank=0,
1295
+ world_size=1,
1296
+ id=0,
1297
+ add_visual=True,
1298
+ add_relation=False,
1299
+ subset=False,
1300
+ choose_left_right=True,
1301
+ ):
1302
+ both_failed_ids = json.load(open("both_failed_ids.json"))
1303
+ os.makedirs(f"visualization/aro_results_{id}", exist_ok=True)
1304
+ # from groundingdino.demo.caption_grounder import caption_grounder
1305
+ # generator = caption_grounder(
1306
+ # config_file="/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
1307
+ # checkpoint_path="/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
1308
+ # cpu_only=False,
1309
+ # box_threshold=0.1, text_threshold=0.1,
1310
+ # )
1311
+ dataset_name = "aro"
1312
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
1313
+ box_token_id = tokenizer("<|#box#|>", add_special_tokens=False)["input_ids"][-1]
1314
+ endofobject_token_id = tokenizer("<|#endofobject#|>", add_special_tokens=False)["input_ids"][-1]
1315
+ endofattr_token_id = tokenizer("<|#endofattr#|>", add_special_tokens=False)["input_ids"][-1]
1316
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
1317
+ visual_token_id = tokenizer("<|#visual#|>", add_special_tokens=False)["input_ids"][-1]
1318
+ previsual_token_id = tokenizer("<|#previsual#|>", add_special_tokens=False)["input_ids"][-1]
1319
+ prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
1320
+ model.eval().cuda()
1321
+ total = 0
1322
+ correct = 0
1323
+ from open_flamingo.eval.dataset_zoo import VG_Relation, VG_Attribution
1324
+ vgr_dataset = VG_Relation(image_preprocess=None, download=True, root_dir="/gpfs/u/home/LMCG/LMCGljnn/scratch/code/vision-language-models-are-bows/data")
1325
+ with open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/unilm/kosmos-2/labels.json") as f:
1326
+ all_labels = json.load(f)
1327
+ label_ids = tokenizer(all_labels).input_ids
1328
+ label_ids = sorted(list(set([x[0] for x in label_ids])))
1329
+
1330
+ if subset:
1331
+ subset_idx = json.load(open("aro_subset.json"))
1332
+ pbar = tqdm(subset_idx, disable=(rank != 0))
1333
+ else:
1334
+ pbar = tqdm(vgr_dataset, disable=(rank != 0))
1335
+ for ii, sample in enumerate(pbar):
1336
+ if subset:
1337
+ ORI_IDX = int(sample)
1338
+ sample = vgr_dataset[sample]
1339
+ # if ORI_IDX != 19036:
1340
+ # continue
1341
+ if ii % world_size != rank:
1342
+ continue
1343
+
1344
+ # not_left_right = ("near" in sample["caption_options"][0] or "next to" in sample["caption_options"][0] or "in front of" in sample["caption_options"][0] or "behind" in sample["caption_options"][0]) or ("left" not in sample["caption_options"][0] and "right" not in sample["caption_options"][0])
1345
+ # if (choose_left_right and not_left_right) or (not choose_left_right and not not_left_right):
1346
+ # if rank == 0:
1347
+ # tqdm.write(f"SKIP: {sample['caption_options'][1]}")
1348
+ # continue
1349
+ total += 1
1350
+ # image = sample["image_options"][0]
1351
+ image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal2/man_on_hydrant.png").convert("RGB")
1352
+ image = image.resize((224, 224))
1353
+
1354
+ # text = sample["caption_options"][1] # 1 is true caption
1355
+ text = "the man is sitting on the fire hydrant"
1356
+ batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
1357
+ text, obj_A, visual_obj_A, obj_B, visual_obj_B, relation = preprocess_visual_info(text)
1358
+
1359
+
1360
+ first_text = f"<|#object#|>the {obj_A}<|#endofobject#|><|#visual#|>"
1361
+ prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>{first_text}"]
1362
+ first_box, first_score = get_bbox(None, batch_images, prompt, model, tokenizer, media_token_id, prebox_token_id, return_all=False)
1363
+
1364
+
1365
+ # use grounding DINO to get the first bbox
1366
+ # caption = f"{obj_A}"
1367
+ # with torch.no_grad():
1368
+ # logits, boxes = generator.ground_caption_raw(image_pil=image, caption=caption)
1369
+ # boxes_filt, pred_phrases = generator.postprocess(logits, boxes, generator.ground_model, caption, generator.text_threshold, generator.box_threshold, with_logits=True)
1370
+ # objects = {}
1371
+ # for box, phrase in zip(boxes_filt, pred_phrases):
1372
+ # obj, score = phrase
1373
+ # obj = obj[0]
1374
+ # if obj not in objects:
1375
+ # objects[obj] = (score, box)
1376
+ # if objects[obj][0] < score:
1377
+ # objects[obj] = (score, box)
1378
+ # try:
1379
+ # first_box = objects[obj_A][1].clone()
1380
+ # first_box[:2] -= first_box[2:] / 2
1381
+ # first_box[2:] += first_box[:2]
1382
+ # first_box = first_box.clamp(0, 0.99) * 224.0
1383
+ # first_box = first_box.numpy()
1384
+ # first_score = objects[obj_A][0]
1385
+ # except:
1386
+ # first_box = None
1387
+
1388
+ if first_box is None:
1389
+ text_A = "the " + obj_A
1390
+ added_bbox_list = None
1391
+ else:
1392
+ text_A = visual_obj_A
1393
+ added_bbox_list = [torch.tensor(first_box).unsqueeze(0).cuda() / 224]
1394
+
1395
+ prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>{text_A} is {relation}<|#object#|><|#previsual#|>"]
1396
+ pre_boxes, pre_scores = get_bbox(added_bbox_list, batch_images, prompt, model, tokenizer, media_token_id,
1397
+ prebox_token_id, return_all=True)
1398
+
1399
+
1400
+ # open_cv_image = np.array(image)
1401
+ # open_cv_image = open_cv_image[:, :, ::-1].copy()
1402
+ # for box, score in zip(pre_box, pre_score):
1403
+ # print(box, score)
1404
+ # if score > 0.1:
1405
+ # open_cv_image = cv2.rectangle(open_cv_image, box[:2].astype(int), box[2:].astype(int), (0, 255, 0), 2)
1406
+ # cv2.imwrite(f"test1.jpg", open_cv_image)
1407
+ # print(sample["caption_options"][idx])
1408
+ # exit()
1409
+
1410
+
1411
+
1412
+ if pre_boxes is None:
1413
+ pre_boxes = [np.array([0.0, 0.0, 223.0, 223.0])]
1414
+ pre_scores = [1.0]
1415
+
1416
+ rank_list = []
1417
+ # pre_boxes = [pre_boxes[0]]
1418
+ # pre_scores = [pre_scores[0]]
1419
+ for pre_box, pre_score in zip(pre_boxes, pre_scores):
1420
+ prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>{text_A} is {relation}<|#object#|><|#previsual#|><|#prebox#|><|#object#|> the {obj_B}<|#endofobject#|>"]
1421
+
1422
+ encodings = tokenizer(
1423
+ prompt,
1424
+ padding="longest",
1425
+ truncation=True,
1426
+ return_tensors="pt",
1427
+ max_length=512,
1428
+ )
1429
+ input_ids = encodings["input_ids"]
1430
+ attention_mask = encodings["attention_mask"]
1431
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
1432
+ image_start_index_list = [[x] for x in image_start_index_list]
1433
+ image_nums = [1] * len(input_ids)
1434
+ vision_x = batch_images.cuda()
1435
+ lang_x = input_ids.cuda()
1436
+ attention_mask = attention_mask.cuda()
1437
+ labels = lang_x.clone()
1438
+
1439
+ answer_start_idx = (labels == tokenizer("<|#object#|>", add_special_tokens=False)["input_ids"][-1]).nonzero()[-1][1] + 1
1440
+ # pre_box = None
1441
+ labels[0, :answer_start_idx] = -100
1442
+ # # labels[labels == endofobject_token_id] = -100
1443
+ # labels[:, 0] = -100
1444
+ # labels[labels == visual_token_id] = -100
1445
+ # labels[labels == box_token_id] = -100
1446
+ # labels[labels == previsual_token_id] = -100
1447
+ # labels[labels == prebox_token_id] = -100
1448
+ # labels[labels == endofattr_token_id] = -100
1449
+ # labels[labels == tokenizer.pad_token_id] = -100
1450
+ # labels[labels == media_token_id] = -100
1451
+ # labels[labels == endofmedia_token_id] = -100
1452
+ answer_ids = tokenizer(f" {obj_B}", add_special_tokens=False)["input_ids"]
1453
+ labels[input_ids == visual_token_id] = -100
1454
+ labels[input_ids == box_token_id] = -100
1455
+ labels[input_ids == endofattr_token_id] = -100
1456
+ labels[input_ids == previsual_token_id] = -100
1457
+ labels[input_ids == prebox_token_id] = -100
1458
+ labels[torch.roll(input_ids == prebox_token_id, 1)] = -100
1459
+ labels[torch.roll(input_ids == box_token_id, 1)] = -100
1460
+ labels[:, 0] = -100
1461
+ labels[input_ids == tokenizer.pad_token_id] = -100
1462
+ labels[input_ids == media_token_id] = -100
1463
+ labels[input_ids == endofmedia_token_id] = -100
1464
+
1465
+ added_bbox_list = None
1466
+ if add_visual:
1467
+ added_bbox_list = []
1468
+ if first_box is not None:
1469
+ added_bbox_list.append(torch.tensor(first_box).unsqueeze(0).cuda().float() / 224)
1470
+ if pre_box is not None:
1471
+ added_bbox_list.append(torch.tensor(pre_box).unsqueeze(0).cuda().float() / 224)
1472
+ if added_bbox_list is not None and len(added_bbox_list) == 0:
1473
+ added_bbox_list = None
1474
+
1475
+ with torch.cuda.amp.autocast(dtype=torch.float16), torch.no_grad():  # chaining contexts with "and" would enter only one of them
1476
+ outputs = model(
1477
+ vision_x=vision_x,
1478
+ lang_x=lang_x,
1479
+ attention_mask=attention_mask,
1480
+ labels=labels,
1481
+ image_nums=image_nums,
1482
+ image_start_index_list=image_start_index_list,
1483
+ added_bbox_list=added_bbox_list,
1484
+ add_box=added_bbox_list is not None,
1485
+ relations=None,
1486
+ )
1487
+ logits = outputs["logits"][0, answer_start_idx:]
1488
+ # _rank = logits[0][label_ids].sort(descending=True).indices.tolist().index(label_ids.index(answer_ids[0]))
1489
+ _rank = logits[0].sort(descending=True).indices.tolist().index(answer_ids[0])
1490
+ print(tokenizer.decode(logits[0].sort(descending=True).indices.tolist()[:10]))
1491
+ print(tokenizer.decode(logits[1].sort(descending=True).indices.tolist()[:10]))
1492
+ rank_list.append(_rank)
1493
+ # open_cv_image = np.array(image)
1494
+ # open_cv_image = open_cv_image[:, :, ::-1].copy()
1495
+ # if first_box is not None:
1496
+ # open_cv_image = cv2.rectangle(open_cv_image, first_box[:2].astype(int), first_box[2:].astype(int), (255, 0, 0), 2)
1497
+ # if pre_box is not None:
1498
+ # open_cv_image = cv2.rectangle(open_cv_image, pre_box[:2].astype(int), pre_box[2:].astype(int), (0, 255, 0), 2)
1499
+
1500
+ # font = cv2.FONT_HERSHEY_SIMPLEX
1501
+ # org = [10, 20]
1502
+ # fontScale = 0.5
1503
+ # color = (0, 0, 0)
1504
+ # thickness = 1
1505
+ # open_cv_image = cv2.resize(open_cv_image, (512, 512))
1506
+ # put_text = sample["caption_options"][1]
1507
+ # open_cv_image = cv2.putText(open_cv_image, put_text, org, font, fontScale, color, thickness, cv2.LINE_AA)
1508
+ # org[1] += 20
1509
+ # put_text = "top10 in green box"
1510
+ # open_cv_image = cv2.putText(open_cv_image, put_text, org, font, fontScale, color, thickness, cv2.LINE_AA)
1511
+ # fontScale = 1.0
1512
+ # thickness = 2
1513
+ # for ind in logits_list[i][0].sort(descending=True).indices[:10]:
1514
+ # org[1] += 20
1515
+ # put_text = f"{tokenizer.decode(ind)}"
1516
+ # open_cv_image = cv2.putText(open_cv_image, put_text, org, font, fontScale, color, thickness, cv2.LINE_AA)
1517
+ # tqdm.write(f"{tokenizer.decode(logits_list[i][0].sort(descending=True).indices[:10])}")
1518
+ # tqdm.write(f"{rank_list}")
1519
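+ # Scoring rule: a caption counts as correct if, for at least one candidate box, the ground-truth object token appears in the top-10 next-token predictions (the lowest rank across boxes decides).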
+ final_rank = min(rank_list)
1520
+ if final_rank < 10:
1521
+ correct += 1
1522
+ TYPE = "CORRECT"
1523
+ # if ii in both_failed_ids:
1524
+ # tqdm.write(f"case find->{sample['caption_options'][1]}")
1525
+ # image.save(f"case_study/{ii}_{rank_list}_{sample['caption_options'][1]}.jpg")
1526
+ if rank == 0:
1527
+ tqdm.write(f"correct: {final_rank} " + prompt[0].replace(tokenizer.pad_token, ""))
1528
+ else:
1529
+ TYPE = "WRONG"
1530
+ if rank == 0:
1531
+ tqdm.write(f"wrong: {final_rank} " + prompt[0].replace(tokenizer.pad_token, ""))
1532
+ # cv2.imwrite(f"visualization/aro_results_{id}/{TYPE}_{ORI_IDX}.jpg", open_cv_image)
1533
+ pbar.set_description(f"score: {correct / total:.4f} | {final_rank}")
1534
+
1535
+
1536
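+ # Each rank dumps its partial [total, correct] counts to a JSON file; after the barrier, rank 0 merges them, prints the score, and writes an empty marker file whose name encodes the score.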
+ with open(f"{dataset_name}_results_part{rank}_{id}.json", "w") as f:
1537
+ f.write(json.dumps([total, correct]))
1538
+ if world_size > 1:
1539
+ torch.distributed.barrier()
1540
+ if rank == 0:
1541
+ total = 0
1542
+ correct = 0
1543
+ print(f"evaluate on rank {rank}. world size is {world_size}")
1544
+ for rank_i in range(world_size):
1545
+ [total_part, correct_part] = json.load(open(f"{dataset_name}_results_part{rank_i}_{id}.json"))
1546
+ os.remove(f"{dataset_name}_results_part{rank_i}_{id}.json")
1547
+ total += total_part
1548
+ correct += correct_part
1549
+ score = correct / total
1550
+ print("score:", score, "total:", total)
1551
+ with open(os.path.join("eval_results", f"{dataset_name}_{model.expr_name}_{model.step_num}_{int(time.time())}_{score}"), "w") as f:
1552
+ pass
1553
+ else:
1554
+ score = 0.0
1555
+ if world_size > 1:
1556
+ torch.distributed.barrier()
1557
+ return score
1558
+
1559
+
1560
+ def evaluate_pisc(
1561
+ model,
1562
+ tokenizer,
1563
+ image_processor,
1564
+ batch_size,
1565
+ tsvfile,
1566
+ max_generation_length=20,
1567
+ num_beams=3,
1568
+ length_penalty=-2.0,
1569
+ device=-1,
1570
+ vis_embed_size=None,
1571
+ rank=0,
1572
+ world_size=1,
1573
+ id=0,
1574
+ add_visual=True,
1575
+ ):
1576
+ from open_flamingo.train.instruction_template import PISC_TEMPLATES
1577
+ dataset_name = "pisc"
1578
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
1579
+ box_token_id = tokenizer("<|#box#|>", add_special_tokens=False)["input_ids"][-1]
1580
+ endofobject_token_id = tokenizer("<|#endofobject#|>", add_special_tokens=False)["input_ids"][-1]
1581
+ endofattr_token_id = tokenizer("<|#endofattr#|>", add_special_tokens=False)["input_ids"][-1]
1582
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
1583
+ visual_token_id = tokenizer("<|#visual#|>", add_special_tokens=False)["input_ids"][-1]
1584
+ model.eval().cuda()
1585
+
1586
+ dataset = wds.WebDataset("/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/junyan/raw/instruct/eval/pisc/000000.tar").decode().to_tuple("image_path.txt", "dataset.txt", "data.pyd")
1587
+ pbar = tqdm(dataset, disable=(rank != 0))
1588
+
1589
+ rel_id_to_type = ["friends", "family", "couple", "professional", "commercial", "no relation"]
1590
+ rel_type_to_id = {x: i for i, x in enumerate(rel_id_to_type)}
1591
+ gt = []
1592
+ pred_scores = []
1593
+ for III, sample in enumerate(pbar):
1594
+ if III % world_size != rank:
1595
+ continue
1596
+ image_path, dataset, data = sample
1597
+ image = Image.open(image_path)
1598
+ size = image_processor.transforms[0].size
1599
+ image = image.resize((size, size))
1600
+ batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
1601
+ boxA = data[0]
1602
+ boxB = data[1]
1603
+ gt_relation = data[2]
1604
+ losses = []
1605
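+ # Score each candidate relation by the language-modeling loss of its filled-in PISC template, conditioned on the two person boxes; the option with the lowest loss becomes the prediction.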
+ for i_rel, option_rel in enumerate(rel_id_to_type):
1606
+ text = PISC_TEMPLATES[0].format(relation=option_rel)
1607
+ added_bbox = [
1608
+ torch.tensor([boxA]).cuda(),
1609
+ torch.tensor([boxB]).cuda(),
1610
+ ]
1611
+ caption = f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>{text}{tokenizer.eos_token}"
1612
+ encodings = tokenizer(
1613
+ caption,
1614
+ padding="longest",
1615
+ truncation=True,
1616
+ return_tensors="pt",
1617
+ max_length=2000,
1618
+ )
1619
+ input_ids = encodings["input_ids"]
1620
+ attention_mask = encodings["attention_mask"]
1621
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
1622
+ image_start_index_list = [[x] for x in image_start_index_list]
1623
+ image_nums = [1] * len(input_ids)
1624
+ vision_x = batch_images.cuda()
1625
+ lang_x = input_ids.cuda()
1626
+ attention_mask = attention_mask.cuda()
1627
+
1628
+ labels = lang_x.clone()
1629
+ labels[labels == tokenizer.pad_token_id] = -100
1630
+ if add_visual:
1631
+ # endofattr_next_token_index = list((labels == endofattr_token_id).nonzero(as_tuple=True))
1632
+ # endofattr_next_token_index[1] += 1
1633
+ # endofattr_next_token_id = labels[endofattr_next_token_index]
1634
+ # </obj><visual><box></attr>NEXT_WORD
1635
+ # </obj> predict NEXT_WORD
1636
+ # <visual><box></attr> predict nothing
1637
+ labels[labels == visual_token_id] = -100
1638
+ labels[labels == box_token_id] = -100
1639
+ labels[labels == endofattr_token_id] = -100
1640
+ # labels[endofattr_next_token_index] = -100
1641
+ labels[:, 0] = -100
1642
+ answer_token_id = tokenizer(" Answer").input_ids[0]
1643
+ answer_token_loc = (input_ids == answer_token_id).nonzero()
1644
+ for batch_idx, idx in answer_token_loc:
1645
+ labels[batch_idx][:idx+2] = -100
1646
+
1647
+ with torch.cuda.amp.autocast(dtype=torch.float16), torch.no_grad():
1648
+ outputs = model(
1649
+ vision_x=vision_x,
1650
+ lang_x=lang_x,
1651
+ attention_mask=attention_mask,
1652
+ labels=labels,
1653
+ image_nums=image_nums,
1654
+ image_start_index_list=image_start_index_list,
1655
+ added_bbox_list=added_bbox,
1656
+ add_box=added_bbox is not None,
1657
+ )
1658
+ loss_total = outputs.loss.reshape(labels.shape[0], -1)
1659
+ loss = loss_total.sum() / (loss_total != 0).sum()
1660
+ losses.append(loss.item())
1661
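+ # Convert the six per-option losses into a normalized score vector via a softmax over the negative losses.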
+ pred_scores.append(np.exp(-np.array(losses)) / np.exp(-np.array(losses)).sum())
1662
+ gt.append(rel_type_to_id[gt_relation])
1663
+ gt = np.array(gt)
1664
+ pred_scores = np.array(pred_scores)
1665
+ pred = pred_scores.argmax(1)
1666
+
1667
+
1668
+ print("total num:", len(gt))
1669
+ recalls = recall_score(y_true=gt, y_pred=pred, average=None, labels=[0,1,2,3,4,5])
1670
+ print("recalls:", recalls)
1671
+
1672
+ with open(f"{dataset_name}_results_part{rank}_{id}.json", "w") as f:
1673
+ f.write(json.dumps([gt.tolist(), pred.tolist()]))
1674
+ if world_size > 1:
1675
+ torch.distributed.barrier()
1676
+ if rank == 0:
1677
+ gt = []
1678
+ pred = []
1679
+ print(f"evaluate on rank {rank}. world size is {world_size}")
1680
+ for rank_i in range(world_size):
1681
+ [gt_part, pred_part] = json.load(open(f"{dataset_name}_results_part{rank_i}_{id}.json"))
1682
+ os.remove(f"{dataset_name}_results_part{rank_i}_{id}.json")
1683
+ gt.extend(gt_part)
1684
+ pred.extend(pred_part)
1685
+ print("total num:", len(gt))
1686
+ recalls = recall_score(y_true=gt, y_pred=pred, average=None, labels=[0,1,2,3,4,5])
1687
+ print("recalls:", recalls)
1688
+ with open(os.path.join("eval_results", f"{dataset_name}_{model.expr_name}_{model.step_num}_{int(time.time())}"), "w") as f:
1689
+ f.write(f"{gt}\n")
1690
+ f.write(f"{pred}\n")
1691
+ f.write(f"{recalls}\n")
1692
+ score = 0.0
1693
+ if world_size > 1:
1694
+ torch.distributed.barrier()
1695
+ return score
1696
+
1697
+
1698
+
1699
+ if __name__ == "__main__":
1700
+ main()
multimodal/build/lib/open_flamingo/eval/evaluate_temp.py ADDED
@@ -0,0 +1,1838 @@
1
+ import argparse
2
+ import json
+ import logging  # used by OKVQAPostProcess.lemmatizer's error path
3
+ from math import ceil
4
+ import os
5
+ import random
6
+ import uuid
7
+ from collections import defaultdict
8
+ from typing import Callable
9
+ import time
10
+ import cv2
11
+ import webdataset as wds
12
+ from sklearn.metrics import recall_score, average_precision_score
13
+
14
+ import more_itertools
15
+ import numpy as np
16
+ import torch
17
+ from coco_metric import compute_cider, postprocess_captioning_generation
18
+ from eval_datasets import VQADataset, GQADataset
19
+ from tqdm import tqdm
20
+ from collections import Counter
21
+
22
+ from vqa_metric import compute_vqa_accuracy, compute_gqa_accuracy
23
+ from open_flamingo.eval.classification import (
24
+ compute_per_sample_probs,
25
+ compute_per_sample_loss,
26
+ )
27
+ from open_flamingo.eval.imagenet_utils import (
28
+ openai_imagenet_classnames,
29
+ IMAGENET_1K_CLASS_ID_TO_LABEL,
30
+ )
31
+
32
+ from open_flamingo.src.factory import create_model_and_transforms
33
+ from PIL import Image
34
+ from io import BytesIO
35
+ import base64
36
+ from open_flamingo.train.distributed import init_distributed_device, world_info_from_env
37
+ import string
38
+ from lavis.datasets.builders import load_dataset
39
+
40
+
41
+ def get_iou(box1, box2):
42
+ # box1 and box2 should be in the format [x1, y1, x2, y2]
43
+ intersection = max(0, min(box1[2], box2[2]) - max(box1[0], box2[0])) * \
44
+ max(0, min(box1[3], box2[3]) - max(box1[1], box2[1]))
45
+ area_box1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
46
+ area_box2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
47
+ union = area_box1 + area_box2 - intersection
48
+ iou = intersection / union if union > 0 else 0
49
+ return iou
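+ # Worked example: get_iou([0, 0, 10, 10], [5, 5, 15, 15]) -> intersection 25, union 175, IoU ~= 0.143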
50
+
51
+ def expand2square(pil_img, background_color):
52
+ width, height = pil_img.size
53
+ if width == height:
54
+ return pil_img
55
+ elif width > height:
56
+ result = Image.new(pil_img.mode, (width, width), background_color)
57
+ result.paste(pil_img, (0, (width - height) // 2))
58
+ return result
59
+ else:
60
+ result = Image.new(pil_img.mode, (height, height), background_color)
61
+ result.paste(pil_img, ((height - width) // 2, 0))
62
+ return result
63
+
64
+ parser = argparse.ArgumentParser()
65
+ parser.add_argument("--lm_path", type=str, default="facebook/opt-1.3b")
66
+ parser.add_argument("--lm_tokenizer_path", type=str, default="facebook/opt-30b")
67
+ parser.add_argument("--vision_encoder_path", default="ViT-L-14", type=str)
68
+ parser.add_argument("--vision_encoder_pretrained", default="openai", type=str)
69
+ parser.add_argument("--checkpoint_path", type=str, required=True)
70
+ parser.add_argument(
71
+ "--results_file", type=str, default=None, help="JSON file to save results"
72
+ )
73
+
74
+ # Trial arguments
75
+ parser.add_argument("--shots", nargs="+", default=[0, 4, 8, 16, 32], type=int)
76
+ parser.add_argument(
77
+ "--num_trials",
78
+ type=int,
79
+ default=1,
80
+ help="Number of trials to run for each shot using different demonstrations",
81
+ )
82
+ parser.add_argument(
83
+ "--trial_seeds",
84
+ nargs="+",
85
+ default=[0],
86
+ help="Seeds to use for each trial for picking demonstrations and eval sets",
87
+ )
88
+ parser.add_argument(
89
+ "--num_samples", type=int, default=5000, help="Number of samples to evaluate on"
90
+ )
91
+
92
+ parser.add_argument("--batch_size", type=int, default=8)
93
+
94
+ # Per-dataset evaluation flags
95
+ parser.add_argument(
96
+ "--eval_coco",
97
+ action="store_true",
98
+ default=False,
99
+ help="Whether to evaluate on COCO.",
100
+ )
101
+ parser.add_argument(
102
+ "--eval_vqav2",
103
+ action="store_true",
104
+ default=False,
105
+ help="Whether to evaluate on VQAV2.",
106
+ )
107
+ parser.add_argument(
108
+ "--eval_ok_vqa",
109
+ action="store_true",
110
+ default=False,
111
+ help="Whether to evaluate on OK-VQA.",
112
+ )
113
+ parser.add_argument(
114
+ "--eval_imagenet",
115
+ action="store_true",
116
+ default=False,
117
+ help="Whether to evaluate on ImageNet.",
118
+ )
119
+
120
+ parser.add_argument(
121
+ "--eval_flickr30",
122
+ action="store_true",
123
+ default=False,
124
+ help="Whether to evaluate on Flickr30.",
125
+ )
126
+
127
+ parser.add_argument(
128
+ "--eval_refcoco",
129
+ action="store_true",
130
+ default=False,
131
+ help="Whether to evaluate on RefCOCO.",
132
+ )
133
+
134
+ # Dataset arguments
135
+
136
+ ## Flickr30 Dataset
137
+ parser.add_argument(
138
+ "--flickr_image_dir_path",
139
+ type=str,
140
+ help="Path to the flickr30/flickr30k_images directory.",
141
+ default=None,
142
+ )
143
+ parser.add_argument(
144
+ "--flickr_annotations_json_path",
145
+ type=str,
146
+ help="Path to the dataset_flickr30k_coco_style.json file.",
147
+ default=None,
148
+ )
149
+
150
+ ## COCO Dataset
151
+ parser.add_argument(
152
+ "--coco_image_dir_path",
153
+ type=str,
154
+ help="Path to the flickr30/flickr30k_images directory.",
155
+ default=None,
156
+ )
157
+ parser.add_argument(
158
+ "--coco_annotations_json_path",
159
+ type=str,
160
+ default=None,
161
+ )
162
+
163
+ ## VQAV2 Dataset
164
+ parser.add_argument(
165
+ "--vqav2_image_dir_path",
166
+ type=str,
167
+ default=None,
168
+ )
169
+ parser.add_argument(
170
+ "--vqav2_questions_json_path",
171
+ type=str,
172
+ default=None,
173
+ )
174
+ parser.add_argument(
175
+ "--vqav2_annotations_json_path",
176
+ type=str,
177
+ default=None,
178
+ )
179
+
180
+ ## OK-VQA Dataset
181
+ parser.add_argument(
182
+ "--ok_vqa_image_dir_path",
183
+ type=str,
184
+ help="Path to the vqav2/train2014 directory.",
185
+ default=None,
186
+ )
187
+ parser.add_argument(
188
+ "--ok_vqa_questions_json_path",
189
+ type=str,
190
+ help="Path to the v2_OpenEnded_mscoco_train2014_questions.json file.",
191
+ default=None,
192
+ )
193
+ parser.add_argument(
194
+ "--ok_vqa_annotations_json_path",
195
+ type=str,
196
+ help="Path to the v2_mscoco_train2014_annotations.json file.",
197
+ default=None,
198
+ )
199
+
200
+ ## Imagenet dataset
201
+ parser.add_argument("--imagenet_root", type=str, default="/tmp")
202
+
203
+ ## RefCOCO dataset
204
+ parser.add_argument("--refcoco_tsvfile", type=str, default=None)
205
+
206
+ parser.add_argument(
207
+ "--location_token_num",
208
+ default=1000,
209
+ type=int,
210
+ )
211
+ # distributed training
212
+ parser.add_argument(
213
+ "--dist-url",
214
+ default="env://",
215
+ type=str,
216
+ help="url used to set up distributed training",
217
+ )
218
+ parser.add_argument(
219
+ "--dist-backend", default="nccl", type=str, help="distributed backend"
220
+ )
221
+ parser.add_argument(
222
+ "--horovod",
223
+ default=False,
224
+ action="store_true",
225
+ help="Use horovod for distributed training.",
226
+ )
227
+ parser.add_argument(
228
+ "--no-set-device-rank",
229
+ default=False,
230
+ action="store_true",
231
+ help="Don't set device index from local rank (when CUDA_VISIBLE_DEVICES restricted to one per proc).",
232
+ )
233
+ parser.add_argument(
234
+ "--dist",
235
+ default=False,
236
+ action="store_true",
237
+ )
238
+ parser.add_argument(
239
+ "--lora",
240
+ default=False,
241
+ action="store_true",
242
+ )
243
+ parser.add_argument(
244
+ "--lora_r",
245
+ default=16,
246
+ type=int,
247
+ required=False,
248
+ )
249
+ parser.add_argument(
250
+ "--legacy",
251
+ default=False,
252
+ action="store_true",
253
+ )
254
+ parser.add_argument(
255
+ "--special",
256
+ default=False,
257
+ action="store_true",
258
+ )
259
+ parser.add_argument(
260
+ "--id",
261
+ default=0,
262
+ type=int,
263
+ required=False,
264
+ )
265
+
266
+ parser.add_argument(
267
+ "--eval_gqa",
268
+ default=False,
269
+ action="store_true",
270
+ )
271
+ parser.add_argument(
272
+ "--use_sam",
273
+ default=None,
274
+ type=str,
275
+ required=False,
276
+ )
277
+ parser.add_argument(
278
+ "--add_visual_token",
279
+ default=False,
280
+ action="store_true",
281
+ )
282
+ parser.add_argument(
283
+ "--use_format_v2",
284
+ default=False,
285
+ action="store_true",
286
+ )
287
+ parser.add_argument(
288
+ "--eval_aro",
289
+ default=False,
290
+ action="store_true",
291
+ )
292
+ parser.add_argument(
293
+ "--eval_pisc",
294
+ default=False,
295
+ action="store_true",
296
+ )
297
+
298
+
299
+ class OKVQAPostProcess():
300
+ def __init__(self):
301
+ self._lemmatizer = None
302
+
303
+ def _lemmatize(self, answers):
304
+ def apply(answer):
305
+ doc = self.lemmatizer(answer)
306
+
307
+ words = []
308
+ for token in doc:
309
+ if token.pos_ in ["NOUN", "VERB"]:
310
+ words.append(token.lemma_)
311
+ else:
312
+ words.append(token.text)
313
+ answer = " ".join(words)
314
+
315
+ return answer
316
+
317
+ return [apply(answer) for answer in answers]
318
+
319
+ @property
320
+ def lemmatizer(self):
321
+ if self._lemmatizer is None:
322
+ try:
323
+ import spacy
324
+
325
+ self._lemmatizer = spacy.load("en_core_web_sm")
326
+ except ImportError:
327
+ logging.error(
328
+ """
329
+ Please install spacy and en_core_web_sm model to apply lemmatization.
330
+ python -m spacy download en_core_web_sm
331
+ OR
332
+ import spacy.cli
333
+ spacy.cli.download("en_core_web_sm")
334
+ """
335
+ )
336
+ exit(1)
337
+
338
+ return self._lemmatizer
339
+
340
+
341
+ def main():
342
+ args = parser.parse_args()
343
+ if args.dist:
344
+ args.local_rank, args.rank, args.world_size = world_info_from_env()
345
+ print(f"local_rank: {args.local_rank} rank: {args.rank} world_size: {args.world_size}")
346
+ device_id = init_distributed_device(args)
347
+ else:
348
+ args.rank = 0
349
+ args.world_size = 1
350
+ print(f"rank: {args.rank} world_size: {args.world_size}")
351
+
352
+ if "sam" in args.checkpoint_path:
353
+ args.use_sam = "vit_l"
354
+
355
+ args.add_visual_token = True
356
+ if "lora" in args.checkpoint_path:
357
+ args.lora = True
358
+
359
+
360
+ args.add_pe = False
361
+ args.add_box = True
362
+ args.relation = False
363
+ args.enhance_data = False
364
+ args.use_format_v2 = True
365
+
366
+
367
+
368
+ import hashlib
369
+ args.id = hashlib.sha224(args.checkpoint_path.encode()).hexdigest()
370
+
371
+ # load model
372
+ flamingo, image_processor, tokenizer, vis_embed_size = create_model_and_transforms(
373
+ args.vision_encoder_path,
374
+ args.vision_encoder_pretrained,
375
+ args.lm_path,
376
+ args.lm_tokenizer_path,
377
+ location_token_num=args.location_token_num,
378
+ lora=args.lora,
379
+ lora_r=16,
380
+ use_sam=args.use_sam,
381
+ add_visual_token=args.add_visual_token,
382
+ use_format_v2=args.use_format_v2,
383
+ add_box=args.add_box,
384
+ add_pe=args.add_pe,
385
+ add_relation=args.relation,
386
+ enhance_data=args.enhance_data,
387
+ )
388
+ flamingo.use_format_v2 = args.use_format_v2
389
+ if args.special:
390
+ flamingo.special = True
391
+ else:
392
+ flamingo.special = False
393
+ if args.legacy:
394
+ flamingo.legacy = True
395
+ print("use legacy evaluation")
396
+ flamingo.step_num = int(args.checkpoint_path.split("/")[-1].split(".")[0].split("_")[-1])
397
+ flamingo.expr_name = args.checkpoint_path.split("/")[-2]
398
+ if args.rank == 0:
399
+ print("legacy", True if hasattr(flamingo, "legacy") else False)
400
+ print("step:", flamingo.step_num)
401
+ print("expr:", flamingo.expr_name)
402
+ print("use format v2:", flamingo.use_format_v2)
403
+ print(args)
404
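+ # Load the checkpoint and strip the DistributedDataParallel "module." prefix so the keys match the bare (unwrapped) model.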
+ checkpoint = torch.load(args.checkpoint_path, map_location="cpu")
405
+ model_state_dict = {}
406
+ for key in checkpoint["model_state_dict"].keys():
407
+ model_state_dict[key.replace("module.", "")] = checkpoint["model_state_dict"][key]
408
+ if "vision_encoder.logit_scale"in model_state_dict:
409
+ # previous checkpoint has some unnecessary weights
410
+ del model_state_dict["vision_encoder.logit_scale"]
411
+ del model_state_dict["vision_encoder.visual.proj"]
412
+ del model_state_dict["vision_encoder.visual.ln_post.weight"]
413
+ del model_state_dict["vision_encoder.visual.ln_post.bias"]
414
+ flamingo.load_state_dict(model_state_dict, strict=True)
415
+ results = defaultdict(list)
416
+ if args.eval_coco:
417
+ print("Evaluating on COCO...")
418
+ for shot in args.shots:
419
+ scores = []
420
+ for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
421
+ cider_score = evaluate_coco_flickr(
422
+ model=flamingo,
423
+ tokenizer=tokenizer,
424
+ image_processor=image_processor,
425
+ batch_size=args.batch_size,
426
+ image_dir_path=args.coco_image_dir_path,
427
+ annotations_json_path=args.coco_annotations_json_path,
428
+ device=args.device,
429
+ seed=seed,
430
+ vis_embed_size=vis_embed_size,
431
+ rank=args.rank,
432
+ world_size=args.world_size,
433
+ id=args.id,
434
+ )
435
+ print(f"Shots {shot} Trial {trial} CIDEr score: {cider_score}")
436
+ scores.append(cider_score)
437
+ print(f"Shots {shot} Mean CIDEr score: {np.mean(scores)}")
438
+ results["coco"].append(
439
+ {"shots": shot, "trials": scores, "mean": np.mean(scores)}
440
+ )
441
+
442
+ if args.eval_ok_vqa:
443
+ print("Evaluating on OK-VQA...")
444
+ for shot in args.shots:
445
+ scores = []
446
+ for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
447
+ ok_vqa_score = evaluate_vqa(
448
+ model=flamingo,
449
+ tokenizer=tokenizer,
450
+ image_processor=image_processor,
451
+ batch_size=args.batch_size,
452
+ image_dir_path=args.ok_vqa_image_dir_path,
453
+ questions_json_path=args.ok_vqa_questions_json_path,
454
+ annotations_json_path=args.ok_vqa_annotations_json_path,
455
+ vqa_dataset="ok_vqa",
456
+ vis_embed_size=vis_embed_size,
457
+ rank=args.rank,
458
+ world_size=args.world_size,
459
+ id=args.id,
460
+ )
461
+ results["ok_vqa"].append(
462
+ {"shots": shot, "score": ok_vqa_score}
463
+ )
464
+
465
+ if args.eval_vqav2:
466
+ print("Evaluating on VQAv2...")
467
+ for shot in args.shots:
468
+ scores = []
469
+ for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
470
+ vqa_score = evaluate_vqa(
471
+ model=flamingo,
472
+ tokenizer=tokenizer,
473
+ image_processor=image_processor,
474
+ batch_size=args.batch_size,
475
+ image_dir_path=args.vqav2_image_dir_path,
476
+ questions_json_path=args.vqav2_questions_json_path,
477
+ annotations_json_path=args.vqav2_annotations_json_path,
478
+ vqa_dataset="vqa",
479
+ vis_embed_size=vis_embed_size,
480
+ rank=args.rank,
481
+ world_size=args.world_size,
482
+ id=args.id,
483
+ )
484
+ results["vqav2"].append(
485
+ {"shots": shot, "score": vqa_score}
486
+ )
487
+
488
+ if args.eval_gqa:
489
+ print("Evaluating on GQA...")
490
+ for shot in args.shots:
491
+ scores = []
492
+ for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
493
+ vqa_score = evaluate_vqa(
494
+ model=flamingo,
495
+ tokenizer=tokenizer,
496
+ image_processor=image_processor,
497
+ batch_size=args.batch_size,
498
+ vqa_dataset="gqa",
499
+ vis_embed_size=vis_embed_size,
500
+ rank=args.rank,
501
+ world_size=args.world_size,
502
+ id=args.id,
503
+ )
504
+ results["gqa"].append(
505
+ {"shots": shot, "score": vqa_score}
506
+ )
507
+
508
+ if args.eval_imagenet:
509
+ print("Evaluating on ImageNet...")
510
+ for shot in args.shots:
511
+ scores = []
512
+ for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
513
+ imagenet_score = evaluate_imagenet(
514
+ model=flamingo,
515
+ tokenizer=tokenizer,
516
+ image_processor=image_processor,
517
+ batch_size=args.batch_size,
518
+ num_samples=args.num_samples,
519
+ num_shots=shot,
520
+ device=args.device,
521
+ seed=seed,
522
+ imagenet_root=args.imagenet_root,
523
+ )
524
+ print(
525
+ f"Shots {shot} Trial {trial} " f"ImageNet score: {imagenet_score}"
526
+ )
527
+ scores.append(imagenet_score)
528
+ print(f"Shots {shot} Mean ImageNet score: {np.mean(scores)}")
529
+ results["imagenet"].append(
530
+ {"shots": shot, "trials": scores, "mean": np.mean(scores)}
531
+ )
532
+
533
+ if args.eval_refcoco:
534
+ print("Evaluating on RefCOCO...")
535
+ refcoco_score = evaluate_refcoco(
536
+ model=flamingo,
537
+ tokenizer=tokenizer,
538
+ image_processor=image_processor,
539
+ batch_size=args.batch_size,
540
+ device=args.device,
541
+ tsvfile=args.refcoco_tsvfile,
542
+ vis_embed_size=vis_embed_size,
543
+ rank=args.rank,
544
+ world_size=args.world_size,
545
+ id=args.id,
546
+ )
547
+ results["refcoco"].append(
548
+ {"score": refcoco_score}
549
+ )
550
+ if args.eval_aro:
551
+ print("Evaluating on ARO...")
552
+ _func = evaluate_aro
553
+ # print("Evaluating on ARO ORI...")
554
+ # _func = evaluate_aro_ori
555
+ aro_score = _func(
556
+ model=flamingo,
557
+ tokenizer=tokenizer,
558
+ image_processor=image_processor,
559
+ batch_size=args.batch_size,
560
+ device=args.device,
561
+ tsvfile=args.refcoco_tsvfile,
562
+ vis_embed_size=vis_embed_size,
563
+ rank=args.rank,
564
+ world_size=args.world_size,
565
+ id=args.id,
566
+ add_relation=args.relation,
567
+ )
568
+ results["aro"].append(
569
+ {"score": aro_score}
570
+ )
571
+ if args.eval_pisc:
572
+ print("Evaluating on ARO...")
573
+ aro_score = evaluate_pisc(
574
+ model=flamingo,
575
+ tokenizer=tokenizer,
576
+ image_processor=image_processor,
577
+ batch_size=args.batch_size,
578
+ device=args.device,
579
+ tsvfile=args.refcoco_tsvfile,
580
+ vis_embed_size=vis_embed_size,
581
+ rank=args.rank,
582
+ world_size=args.world_size,
583
+ id=args.id,
584
+ )
585
+ results["pisc"].append(
586
+ {"score": aro_score}
587
+ )
588
+
589
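+ # Stack the processed images into a single tensor, adding singleton dims per image (presumably batch x num_images x frames x C x H x W for the Flamingo-style vision input).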
+ def prepare_batch_images(batch, image_processor):
590
+ batch_images = None
591
+ for b in batch:
592
+ b_image = image_processor(b["image"]).unsqueeze(0).unsqueeze(1).unsqueeze(0)
593
+ if batch_images is None:
594
+ batch_images = b_image
595
+ else:
596
+ batch_images = torch.cat([batch_images, b_image], dim=0)
597
+ return batch_images
598
+
599
+ def get_outputs(
600
+ model,
601
+ batch_images,
602
+ attention_mask,
603
+ max_generation_length,
604
+ min_generation_length,
605
+ num_beams,
606
+ length_penalty,
607
+ input_ids,
608
+ image_start_index_list=None,
609
+ image_nums=None,
610
+ bad_words_ids=None,
611
+ ):
612
+ with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
613
+ outputs = model.generate(
614
+ batch_images,
615
+ input_ids,
616
+ attention_mask=attention_mask,
617
+ max_new_tokens=max_generation_length,
618
+ min_length=min_generation_length,
619
+ num_beams=num_beams,
620
+ length_penalty=length_penalty,
621
+ image_start_index_list=image_start_index_list,
622
+ image_nums=image_nums,
623
+ bad_words_ids=bad_words_ids,
624
+ )
625
+
626
+ outputs = outputs[:, len(input_ids[0]) :]
627
+ return outputs
628
+
629
+
630
+ def evaluate_coco_flickr(
631
+ model,
632
+ tokenizer,
633
+ image_processor,
634
+ batch_size,
635
+ image_dir_path,
636
+ annotations_json_path,
637
+ seed=42,
638
+ max_generation_length=20,
639
+ num_beams=1,
640
+ length_penalty=-2.0,
641
+ device=-1,
642
+ is_flickr=False,
643
+ vis_embed_size=None,
644
+ rank=0,
645
+ world_size=1,
646
+ id=0,
647
+ ):
648
+ """Evaluate a model on COCO dataset.
649
+
650
+ Args:
651
+ model (nn.Module): model to evaluate
652
+ tokenizer (transformers.PreTrainedTokenizer): tokenizer for the model
653
+ image_processor : image processor for the model
654
+ batch_size (int): batch size
655
+ image_dir_path (str, optional): path to the directory containing the images.
656
+ annotations_json_path (str, optional): path to the json file containing the annotations.
657
+ seed (int, optional): seed for random number generator. Defaults to 42.
658
+ max_generation_length (int, optional): maximum length of the generated caption. Defaults to 20.
659
+ num_beams (int, optional): number of beams to use for beam search. Defaults to 1.
660
+ length_penalty (float, optional): length penalty for beam search. Defaults to -2.0.
661
+ num_samples (int, optional): number of samples to evaluate on. Defaults to 5000.
662
+ query_set_size (int, optional): number of samples to use for query set. Defaults to 2048.
663
+ num_shots (int, optional): number of in-context samples to use. Defaults to 8.
664
+ device (int, optional): device to use. Defaults to -1.
665
+ num_workers (int, optional): number of workers to use for dataloader. Defaults to 4.
666
+ is_flickr (bool): defines if that data is COCO or Flickr. Defaults to False (COCO).
667
+
668
+ Returns:
669
+ float: CIDEr score
670
+
671
+ """
672
+ # eval_dataset = COCOFlickrDataset(
673
+ # image_dir_path=image_dir_path,
674
+ # annotations_path=annotations_json_path,
675
+ # is_flickr=is_flickr,
676
+ # )
677
+ coco_dataset = load_dataset("coco_caption")
678
+ eval_dataset = coco_dataset["test"]
679
+
680
+
681
+ model.eval().cuda()
682
+ predictions = defaultdict()
683
+ lang_encoder_name = model.lang_encoder.__class__.__name__.lower()
684
+ # if "peft" in lang_encoder_name:
685
+ # lang_encoder_name = model.lang_encoder.base_model.model.__class__.__name__.lower()
686
+ try:
687
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
688
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
689
+ pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
690
+ bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
691
+ except:
692
+ pass
693
+
694
+ def get_prompt(sample):
695
+ return f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>"
696
+
697
+ tokenizer.padding_side = "left"
698
+ cnt = 0
699
+ if world_size > 1:
700
+ torch.distributed.barrier()
701
+ desc = "Running inference Flickr30" if is_flickr else "Running inference COCO"
702
+ for ii, batch in enumerate(more_itertools.chunked(
703
+ tqdm(eval_dataset, desc=desc, disable=(rank != 0)), batch_size
704
+ )):
705
+ if ii % world_size != rank:
706
+ continue
707
+ cnt += len(batch)
708
+ batch_images = prepare_batch_images(
709
+ batch=batch,
710
+ image_processor=image_processor,
711
+ ).cuda()
712
+ batch_text = [get_prompt(s) for s in batch]
713
+ encodings = tokenizer(
714
+ batch_text,
715
+ padding="longest",
716
+ truncation=True,
717
+ return_tensors="pt",
718
+ max_length=2000,
719
+ )
720
+ input_ids = encodings["input_ids"].cuda()
721
+ attention_mask = encodings["attention_mask"].cuda()
722
+ skip_special_tokens = False
723
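+ # Legacy-checkpoint path (assumed older OPT prompt layout): shift the <|#image#|>/<|#endofimage#|> markers one token earlier and refill their old slots with pad/bos so the prompt matches that format.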
+ if hasattr(model, "legacy") and model.legacy and "opt" in lang_encoder_name:
724
+ if rank == 0:
725
+ tqdm.write("use legacy model")
726
+ skip_special_tokens = True
727
+ for i in range(len(input_ids)):
728
+ media_token_index = (input_ids[i] == media_token_id).nonzero()[0,0]
729
+ endofmedia_token_index = (input_ids[i] == endofmedia_token_id).nonzero()[0,0]
730
+ input_ids[i, media_token_index - 1] = media_token_id
731
+ input_ids[i, media_token_index] = pad_token_id
732
+ input_ids[i, endofmedia_token_index - 1] = endofmedia_token_id
733
+ input_ids[i, endofmedia_token_index] = bos_token_id
734
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
735
+ image_start_index_list = [[x] for x in image_start_index_list]
736
+ image_nums = [1] * len(input_ids)
737
+ if "llama" in lang_encoder_name:
738
+ attention_mask[input_ids == 0] = 0
739
+ outputs = get_outputs(
740
+ model=model,
741
+ batch_images=batch_images,
742
+ attention_mask=attention_mask,
743
+ max_generation_length=30,
744
+ min_generation_length=8,
745
+ num_beams=5,
746
+ length_penalty=0,
747
+ input_ids=input_ids,
748
+ image_start_index_list=image_start_index_list,
749
+ image_nums=image_nums,
750
+ )
751
+ new_predictions = [
752
+ postprocess_captioning_generation(out).replace('"', "")
753
+ for out in tokenizer.batch_decode(outputs, skip_special_tokens=True)
754
+ ]
755
+ # if rank == 0:
756
+ # tqdm.write(f"{batch_images.shape} {batch[0]} pred: {new_predictions[0]}")
757
+
758
+ for i, sample in enumerate(batch):
759
+ predictions[int(sample["image_id"])] = {
760
+ "caption": new_predictions[i],
761
+ }
762
+ results_path = (
763
+ f"flickrresults_{lang_encoder_name}_{rank}_{id}.json"
764
+ if is_flickr
765
+ else f"cocoresults_{lang_encoder_name}_{rank}_{id}.json"
766
+ )
767
+ with open(results_path, "w") as f:
768
+ f.write(
769
+ json.dumps(
770
+ [
771
+ {"image_id": k, "caption": predictions[k]["caption"]}
772
+ for k in predictions
773
+ ],
774
+ indent=2,
775
+ )
776
+ )
777
+ print("save to", results_path)
778
+ del predictions
779
+ time.sleep(10)
780
+ if world_size > 1:
781
+ torch.distributed.barrier()
782
+ if rank == 0:
783
+ print(f"evaluate on rank {rank}. world size is {world_size}")
784
+ predictions = []
785
+ for rank_i in range(world_size):
786
+ part_results_path = (
787
+ f"flickrresults_{lang_encoder_name}_{rank_i}_{id}.json"
788
+ if is_flickr
789
+ else f"cocoresults_{lang_encoder_name}_{rank_i}_{id}.json"
790
+ )
791
+ print("load", part_results_path)
792
+ predictions.extend(json.load(open(part_results_path)))
793
+ os.remove(part_results_path)
794
+ print("num:", len(predictions))
795
+ results_path = (
796
+ f"flickrresults_{lang_encoder_name}.json"
797
+ if is_flickr
798
+ else f"cocoresults_{lang_encoder_name}.json"
799
+ )
800
+ json.dump(predictions, open(results_path, "w"), indent=2)
801
+
802
+ metrics = compute_cider(
803
+ result_path=results_path,
804
+ annotations_path="/gpfs/u/home/LMCG/LMCGljnn/scratch/.cache/lavis/coco_gt/coco_karpathy_test_gt.json",
805
+ )
806
+ os.makedirs("eval_results", exist_ok=True)
807
+ acc = metrics["CIDEr"]
808
+ with open(os.path.join("eval_results", f"cococap_{model.expr_name}_{model.step_num}_{int(time.time())}_{acc}"), "w") as f:
809
+ f.write(json.dumps(predictions, indent=2))
810
+
811
+ # delete the temporary file
812
+ os.remove(results_path)
813
+ else:
814
+ metrics = {}
815
+ metrics["CIDEr"] = 0.0
816
+
817
+ return metrics["CIDEr"]
818
+
819
+
820
+ def evaluate_vqa(
821
+ model,
822
+ tokenizer,
823
+ image_processor,
824
+ batch_size,
825
+ image_dir_path=None,
826
+ questions_json_path=None,
827
+ annotations_json_path=None,
828
+ vqa_dataset="vqa",
829
+ vis_embed_size=None,
830
+ rank=0,
831
+ world_size=1,
832
+ id=0,
833
+ ):
834
+ """
835
+ Evaluate a model on VQA datasets. Currently supports VQA v2.0.
836
+
837
+ Args:
838
+ model (nn.Module): model to evaluate
839
+ tokenizer (transformers.PreTrainedTokenizer): tokenizer for the model
840
+ image_processor : image processor for the model
841
+ batch_size (int): batch size
842
+ image_dir_path (str): path to image directory
843
+ questions_json_path (str): path to questions json file
844
+ annotations_json_path (str): path to annotations json file
845
+ seed (int, optional): random seed. Defaults to 42.
846
+ max_generation_length (int, optional): max generation length. Defaults to 5.
847
+ num_beams (int, optional): number of beams to use for beam search. Defaults to 3.
848
+ length_penalty (float, optional): length penalty for beam search. Defaults to -2.0.
849
+ num_samples (int, optional): number of samples to evaluate on. Defaults to 5000 samples.
850
+ query_set_size (int, optional): size of the query set. Defaults to 2048.
851
+ num_shots (int, optional): number of shots to use. Defaults to 8.
852
+ device (int, optional): device to use. Defaults to -1 (cpu).
853
+ num_workers (int, optional): number of workers to use. Defaults to 4.
854
+ vqa_dataset (string): type of vqa dataset: currently supports vqa, ok_vqa. Defaults to vqa.
855
+ Returns:
856
+ float: accuracy score
857
+ """
858
+ if world_size > 1:
859
+ torch.distributed.barrier()
860
+ if vqa_dataset == "gqa":
861
+ eval_dataset = GQADataset()
862
+ else:
863
+ eval_dataset = VQADataset(
864
+ image_dir_path=image_dir_path,
865
+ question_path=questions_json_path,
866
+ annotations_path=annotations_json_path,
867
+ vqa_dataset=vqa_dataset,
868
+ )
869
+ postprocessor = OKVQAPostProcess()
870
+ try:
871
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
872
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
873
+ pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
874
+ bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
875
+ except:
876
+ pass
877
+ def get_prompt(sample):
878
+ return f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>Question: {sample['question'].strip()} Short answer:"
879
+ # return f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>"
880
+
881
+ model.eval().cuda()
882
+ lang_encoder_name = model.lang_encoder.__class__.__name__.lower()
883
+ if "peft" in lang_encoder_name:
884
+ lang_encoder_name = model.lang_encoder.base_model.model.__class__.__name__.lower()
885
+ predictions = []
886
+ tokenizer.padding_side = "left"
887
+ if world_size > 1:
888
+ torch.distributed.barrier()
889
+ this_tot = 0
890
+ for ii, batch in enumerate(more_itertools.chunked(
891
+ tqdm(eval_dataset, desc="Running inference", disable=(rank != 0)), batch_size
892
+ )):
893
+ if ii % world_size != rank:
894
+ continue
895
+ batch_images = prepare_batch_images(
896
+ batch=batch,
897
+ image_processor=image_processor,
898
+ ).cuda()
899
+ batch_text = [get_prompt(s) for s in batch]
900
+ encodings = tokenizer(
901
+ batch_text,
902
+ return_tensors="pt",
903
+ padding="longest",
904
+ truncation=True,
905
+ max_length=2000,
906
+ )
907
+ input_ids = encodings["input_ids"].cuda()
908
+ attention_mask = encodings["attention_mask"].cuda()
909
+ skip_special_tokens = True
910
+ if hasattr(model, "legacy") and model.legacy and "opt" in lang_encoder_name:
911
+ if rank == 0:
912
+ tqdm.write("use legacy model")
913
+ for i in range(len(input_ids)):
914
+ media_token_index = (input_ids[i] == media_token_id).nonzero()[0,0]
915
+ endofmedia_token_index = (input_ids[i] == endofmedia_token_id).nonzero()[0,0]
916
+ input_ids[i, media_token_index - 1] = media_token_id
917
+ input_ids[i, media_token_index] = pad_token_id
918
+ input_ids[i, endofmedia_token_index - 1] = endofmedia_token_id
919
+ input_ids[i, endofmedia_token_index] = bos_token_id
920
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
921
+ image_start_index_list = [[x] for x in image_start_index_list]
922
+ image_nums = [1] * len(input_ids)
923
+ if "llama" in lang_encoder_name:
924
+ attention_mask[input_ids == 0] = 0
925
+ outputs = get_outputs(
926
+ model=model,
927
+ batch_images=batch_images,
928
+ attention_mask=attention_mask,
929
+ max_generation_length=10,
930
+ min_generation_length=1,
931
+ num_beams=5,
932
+ length_penalty=0,
933
+ input_ids=input_ids,
934
+ image_start_index_list=image_start_index_list,
935
+ image_nums=image_nums,
936
+ )
937
+ # postprocess begin
938
+ new_predictions = [
939
+ out.strip().lower().strip(string.punctuation+" ") for out in tokenizer.batch_decode(outputs, skip_special_tokens=skip_special_tokens)
940
+ ]
941
+ if vqa_dataset == "ok_vqa":
942
+ new_predictions = postprocessor._lemmatize(new_predictions)
943
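+ # Optional lenient matching (model.special): if any annotated answer string occurs in the generated text, snap the prediction to it, with extra handling that maps "cant"/"can" phrasings to no/yes.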
+ if model.special:
944
+ for i in range(len(new_predictions)):
945
+ for answer, _ in Counter(batch[i]['answers']).most_common():
946
+ if answer in new_predictions[i]:
947
+ new_predictions[i] = answer
948
+ break
949
+ if "cant" in new_predictions[i] and "no" == answer:
950
+ new_predictions[i] = answer
951
+ break
952
+ if "can" in new_predictions[i] and "not" not in new_predictions[i] and "cant" not in new_predictions[i] and "yes" == answer:
953
+ new_predictions[i] = answer
954
+ break
955
+
956
+ this_tot += 1
957
+ if rank == 0 and this_tot % 20 == 0:
958
+ for i in range(1):
959
+ tqdm.write(f"question: {batch[i]['question']}\nanswer: {batch[i]['answers']}model output: " + new_predictions[i])
960
+
961
+ predictions.extend(
962
+ [
963
+ {"answer": p, "question_id": sample["question_id"], "_question": sample["question"], "answers": sample["answers"]}
964
+ for p, sample in zip(new_predictions, batch)
965
+ ]
966
+ )
967
+ with open(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank}_{id}.json", "w") as f:
968
+ f.write(json.dumps(predictions))
969
+ print("save to", f"{vqa_dataset}_{lang_encoder_name}_results_part{rank}_{id}.json")
970
+
971
+ time.sleep(10)
972
+ if world_size > 1:
973
+ torch.distributed.barrier()
974
+ if rank == 0:
975
+ print(f"evaluate on rank {rank}. world size is {world_size}")
976
+ predictions = []
977
+ for rank_i in range(world_size):
978
+ print("load", f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")
979
+ predictions.extend(json.load(open(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")))
980
+ os.remove(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")
981
+ print("num:", len(predictions))
982
+ # save the predictions to a temporary file
983
+ random_uuid = str(uuid.uuid4())
984
+ with open(f"{vqa_dataset}results_{random_uuid}.json", "w") as f:
985
+ f.write(json.dumps(predictions, indent=4))
986
+
987
+ if vqa_dataset == "gqa":
988
+ acc = compute_gqa_accuracy(predictions)
989
+ else:
990
+ acc = compute_vqa_accuracy(
991
+ f"{vqa_dataset}results_{random_uuid}.json",
992
+ questions_json_path,
993
+ annotations_json_path,
994
+ vqa_dataset=vqa_dataset,
995
+ )
996
+ print(vqa_dataset, "score:", acc, "| save to", f"{vqa_dataset}results_{random_uuid}.json")
997
+ os.makedirs("eval_results", exist_ok=True)
998
+ with open(os.path.join("eval_results", f"{vqa_dataset}_{model.expr_name}_{model.step_num}_{int(time.time())}_{acc}"), "w") as f:
999
+ f.write(json.dumps(predictions, indent=2))
1000
+
1001
+ # delete the temporary file
1002
+ os.remove(f"{vqa_dataset}results_{random_uuid}.json")
1003
+ else:
1004
+ time.sleep(5)
1005
+ acc = 0.0
1006
+ if world_size > 1:
1007
+ torch.distributed.barrier()
1008
+ return acc
1009
+
1010
+
1011
+ def evaluate_refcoco(
1012
+ model,
1013
+ tokenizer,
1014
+ image_processor,
1015
+ batch_size,
1016
+ tsvfile,
1017
+ max_generation_length=20,
1018
+ num_beams=3,
1019
+ length_penalty=-2.0,
1020
+ device=-1,
1021
+ vis_embed_size=None,
1022
+ rank=0,
1023
+ world_size=1,
1024
+ id=0,
1025
+ ):
1026
+ model.eval().cuda()
1027
+ loc_token_ids = []
1028
+ for i in range(1000):
1029
+ loc_token_ids.append(int(tokenizer(f"<loc_{i}>", add_special_tokens=False)["input_ids"][-1]))
1030
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
1031
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
1032
+ pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
1033
+ bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
1034
+ prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
1035
+ # all_ids = set(range(model.lang_encoder.lm_head.out_features))
1036
+ # bad_words_ids = list(all_ids - set(loc_token_ids))
1037
+ # bad_words_ids = [[b] for b in bad_words_ids]
1038
+ # min_loc_token_id = min(loc_token_ids)
1039
+ # max_loc_token_id = max(loc_token_ids)
1040
+ total = 0
1041
+ correct = 0
1042
+ ious = []
1043
+ if "refcocog" in tsvfile:
1044
+ dataset_name = "refcocog"
1045
+ elif "refcocoplus" in tsvfile:
1046
+ dataset_name = "refcocoplus"
1047
+ else:
1048
+ dataset_name = "refcoco"
1049
+ with open(tsvfile, "r") as f:
1050
+ lines = f.readlines()
1051
+ pbar = tqdm(lines, disable=(rank != 0))
1052
+ for ii, line in enumerate(pbar):
1053
+ if ii % world_size != rank:
1054
+ continue
1055
+ total += 1
1056
+ line = line.rstrip()
1057
+ uniq_id, image_id, text, region_coord, image = line.split("\t")
1058
+
1059
+ image = Image.open(BytesIO(base64.urlsafe_b64decode(image))).convert("RGB")
1060
+ # image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal2/yolo.png").convert("RGB")
1061
+ # image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal/temp/cat.png").convert("RGB")
1062
+ # image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal/temp/262148000.png")
1063
+
1064
+ gt_box = np.array(list(map(float, region_coord.split(","))))
1065
+ width = image.width
1066
+ height = image.height
1067
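+ # Resize the image to 224x224 and rescale the ground-truth box into the same 224x224 coordinate frame.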
+ image = image.resize((224, 224))
1068
+ gt_box = gt_box / np.array([width, height, width, height]) * 224
1069
+ batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
1070
+ prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|><|#object#|>{text.rstrip('.').strip()}<|#endofobject#|><|#visual#|>"]
1071
+ # prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>the cat<|#visual#|>"]
1072
+ # prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>"]
1073
+ # prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>a man<|#visual#|> is doing a trick on a skateboard<|#visual#|>"]
1074
+
1075
+
1076
+ encodings = tokenizer(
1077
+ prompt,
1078
+ padding="longest",
1079
+ truncation=True,
1080
+ return_tensors="pt",
1081
+ max_length=2000,
1082
+ )
1083
+ input_ids = encodings["input_ids"]
1084
+ attention_mask = encodings["attention_mask"]
1085
+ # attention_mask[input_ids == prebox_token_id] = 0
1086
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
1087
+ image_start_index_list = [[x] for x in image_start_index_list]
1088
+ image_nums = [1] * len(input_ids)
1089
+ vision_x = batch_images.cuda()
1090
+ lang_x = input_ids.cuda()
1091
+ attention_mask = attention_mask.cuda()
1092
+
1093
+ model.debug_id = 0
1094
+ with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
1095
+ outputs = model(
1096
+ vision_x=vision_x,
1097
+ lang_x=lang_x,
1098
+ attention_mask=attention_mask,
1099
+ labels=None,
1100
+ image_nums=image_nums,
1101
+ image_start_index_list=image_start_index_list,
1102
+ added_bbox_list=None,
1103
+ add_box=False,
1104
+ )
1105
+ boxes = outputs["boxes"]
1106
+ scores = outputs["scores"]
1107
+ if len(scores) > 0:
1108
+ box = boxes[scores.argmax()]
1109
+ iou = get_iou(box, gt_box)
1110
+ else:
1111
+ iou = 0.0
1112
+ # tqdm.write(f"output: {tokenizer.batch_decode(outputs)}")
1113
+ tqdm.write(f"no output for: {uniq_id}, {image_id}, {text}")
1114
+ if iou >= 0.5:
1115
+ correct += 1
1116
+ pbar.set_description(f"iou: {iou:.2f} score: {correct / total:.4f}")
1117
+ # open_cv_image = np.array(image)
1118
+ # # Convert RGB to BGR
1119
+ # open_cv_image = open_cv_image[:, :, ::-1].copy()
1120
+ # for box, score in zip(boxes, scores):
1121
+ # open_cv_image = cv2.rectangle(open_cv_image, box[:2].astype(int), box[2:].astype(int), (255, 0, 0), 2)
1122
+ # cv2.imwrite("output.jpg", open_cv_image)
1123
+ # print(boxes)
1124
+ # print(scores)
1125
+ # exit()
1126
+
1127
+
1128
+ with open(f"{dataset_name}_results_part{rank}_{id}.json", "w") as f:
1129
+ f.write(json.dumps([total, correct]))
1130
+ if world_size > 1:
1131
+ torch.distributed.barrier()
1132
+ if rank == 0:
1133
+ total = 0
1134
+ correct = 0
1135
+ print(f"evaluate on rank {rank}. world size is {world_size}")
1136
+ for rank_i in range(world_size):
1137
+ [total_part, correct_part] = json.load(open(f"{dataset_name}_results_part{rank_i}_{id}.json"))
1138
+ os.remove(f"{dataset_name}_results_part{rank_i}_{id}.json")
1139
+ total += total_part
1140
+ correct += correct_part
1141
+ score = correct / total
1142
+ print("score:", score)
1143
+ with open(os.path.join("eval_results", f"{dataset_name}_{model.expr_name}_{model.step_num}_{int(time.time())}_{score}"), "w") as f:
1144
+ pass
1145
+ else:
1146
+ score = 0.0
1147
+ if world_size > 1:
1148
+ torch.distributed.barrier()
1149
+ return score
1150
+
1151
+
1152
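+ # Worked example: "the dog is sitting on the floor" parses to obj_A="dog", relation="sitting on", obj_B="floor", which are then wrapped in the <|#object#|>/<|#visual#|>/<|#previsual#|> grounding tags.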
+ def preprocess_visual_info(Text):
1153
+ text = Text.split(" ")
1154
+ for is_idx, t in enumerate(text):
1155
+ if t == "is":
1156
+ break
1157
+ the_idx = is_idx
1158
+ while text[the_idx] != "the":
1159
+ the_idx -= 1
1160
+ obj_A = " ".join(text[the_idx+1:is_idx])
1161
+ second_the_idx = len(text) - 1
1162
+ while text[second_the_idx] != "the":
1163
+ second_the_idx -= 1
1164
+ obj_B = " ".join(text[second_the_idx+1:])
1165
+ relation = " ".join(text[is_idx+1:second_the_idx])
1166
+ visual_obj_A = f"<|#object#|>the {obj_A}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|>"
1167
+ visual_obj_B = f"<|#object#|><|#previsual#|><|#prebox#|><|#object#|>the {obj_B}<|#endofobject#|>"
1168
+ Text = f"{visual_obj_A} is {relation} {visual_obj_B}"
1169
+ return Text, obj_A, visual_obj_A, obj_B, visual_obj_B, relation
1170
+
1171
+
1172
+
1173
+ def get_bbox(visual_box_list, batch_images, prompt, model, tokenizer, media_token_id, prebox_token_id, mask_prebox, debug=False, return_all=False):
1174
+ assert isinstance(prompt, list) and len(prompt) == 1 and isinstance(prompt[0], str)
1175
+ encodings = tokenizer(
1176
+ prompt,
1177
+ padding="longest",
1178
+ truncation=True,
1179
+ return_tensors="pt",
1180
+ max_length=2000,
1181
+ )
1182
+ input_ids = encodings["input_ids"]
1183
+ attention_mask = encodings["attention_mask"]
1184
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
1185
+ image_start_index_list = [[x] for x in image_start_index_list]
1186
+ image_nums = [1] * len(input_ids)
1187
+ vision_x = batch_images.cuda()
1188
+ lang_x = input_ids.cuda()
1189
+ attention_mask = attention_mask.cuda()
1190
+ prebox_mask = (input_ids == prebox_token_id)
1191
+ if mask_prebox and prebox_mask.any():
1192
+ attention_mask[prebox_mask] = 0
1193
+
1194
+ model.debug_id = 0
1195
+ with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
1196
+ outputs = model(
1197
+ vision_x=vision_x,
1198
+ lang_x=lang_x,
1199
+ attention_mask=attention_mask,
1200
+ labels=None,
1201
+ image_nums=image_nums,
1202
+ image_start_index_list=image_start_index_list,
1203
+ added_bbox_list=visual_box_list,
1204
+ add_box=visual_box_list is not None,
1205
+ relations=None,
1206
+ debug_mode=False,
1207
+ )
1208
+ boxes = outputs["boxes"]
1209
+ scores = outputs["scores"]
1210
+ if debug:
1211
+ import pdb; pdb.set_trace()
1212
+ if return_all:
1213
+ return boxes, scores
1214
+ if len(scores) == 0:
1215
+ return None, None
1216
+ else:
1217
+ return boxes[scores.argmax()], scores.max()
1218
+
1219
+
1220
+ def evaluate_aro(
1221
+ model,
1222
+ tokenizer,
1223
+ image_processor,
1224
+ batch_size,
1225
+ tsvfile,
1226
+ max_generation_length=20,
1227
+ num_beams=3,
1228
+ length_penalty=-2.0,
1229
+ device=-1,
1230
+ vis_embed_size=None,
1231
+ rank=0,
1232
+ world_size=1,
1233
+ id=0,
1234
+ add_visual=True,
1235
+ add_relation=False,
1236
+ subset=True,
1237
+ choose_left_right=True,
1238
+ ):
1239
+ os.makedirs(f"visualization/aro_results_{id}", exist_ok=True)
1240
+ from groundingdino.demo.caption_grounder import caption_grounder
1241
+ generator = caption_grounder(
1242
+ config_file="/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
1243
+ checkpoint_path="/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
1244
+ cpu_only=False,
1245
+ box_threshold=0.1, text_threshold=0.1,
1246
+ )
1247
+ dataset_name = "aro"
1248
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
1249
+ box_token_id = tokenizer("<|#box#|>", add_special_tokens=False)["input_ids"][-1]
1250
+ endofobject_token_id = tokenizer("<|#endofobject#|>", add_special_tokens=False)["input_ids"][-1]
1251
+ endofattr_token_id = tokenizer("<|#endofattr#|>", add_special_tokens=False)["input_ids"][-1]
1252
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
1253
+ visual_token_id = tokenizer("<|#visual#|>", add_special_tokens=False)["input_ids"][-1]
1254
+ previsual_token_id = tokenizer("<|#previsual#|>", add_special_tokens=False)["input_ids"][-1]
1255
+ prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
1256
+ model.eval().cuda()
1257
+ total = 0
1258
+ correct = 0
1259
+ from open_flamingo.eval.dataset_zoo import VG_Relation, VG_Attribution
1260
+ vgr_dataset = VG_Relation(image_preprocess=None, download=True, root_dir="/gpfs/u/home/LMCG/LMCGljnn/scratch/code/vision-language-models-are-bows/data")
1261
+ with open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/unilm/kosmos-2/labels.json") as f:
1262
+ all_labels = json.load(f)
1263
+ label_ids = tokenizer(all_labels).input_ids
1264
+ label_ids = sorted(list(set([x[0] for x in label_ids])))
1265
+
1266
+ if subset:
1267
+ subset_idx = json.load(open("aro_subset.json"))
1268
+ pbar = tqdm(subset_idx, disable=(rank != 0))
1269
+ else:
1270
+ pbar = tqdm(vgr_dataset, disable=(rank != 0))
1271
+
1272
+
1273
+ exist_total = 0
1274
+ for ii, sample in enumerate(pbar):
1275
+ if subset:
1276
+ ORI_IDX = int(sample)
1277
+ sample = vgr_dataset[sample]
1278
+ # if ORI_IDX != 19036:
1279
+ # continue
1280
+ if ii % world_size != rank:
1281
+ continue
1282
+
1283
+ not_left_right = ("near" in sample["caption_options"][0] or "next to" in sample["caption_options"][0] or "in front of" in sample["caption_options"][0] or "behind" in sample["caption_options"][0]) or ("left" not in sample["caption_options"][0] and "right" not in sample["caption_options"][0])
1284
+ if (choose_left_right and not_left_right) or (not choose_left_right and not not_left_right):
1285
+ if rank == 0:
1286
+ tqdm.write(f"SKIP: {sample['caption_options'][1]}")
1287
+ continue
1288
+ total += 1
1289
+ image = sample["image_options"][0]
1290
+ # image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal2/yolo.png").convert("RGB")
1291
+ image = image.resize((224, 224))
1292
+
1293
+ chosen_idx = 0
1294
+ text = sample["caption_options"][chosen_idx] # 1 is true caption
1295
+ # text = "the dog is sitting on the floor" if idx == 1 else "the floor is sitting on the dog"
1296
+ batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
1297
+ text, obj_A, visual_obj_A, obj_B, visual_obj_B, relation = preprocess_visual_info(text)
1298
+
1299
+
1300
+ first_text = f"<|#object#|>the {obj_A}<|#endofobject#|><|#visual#|>"
1301
+ prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>{first_text}"]
1302
+ first_box, first_score = get_bbox(None, batch_images, prompt, model, tokenizer, media_token_id, prebox_token_id, mask_prebox=True, return_all=False)
1303
+
1304
+
1305
+ # use grounding DINO to get the first bbox
1306
+ # caption = f"{obj_A}"
1307
+ # with torch.no_grad():
1308
+ # logits, boxes = generator.ground_caption_raw(image_pil=image, caption=caption)
1309
+ # boxes_filt, pred_phrases = generator.postprocess(logits, boxes, generator.ground_model, caption, generator.text_threshold, generator.box_threshold, with_logits=True)
1310
+ # objects = {}
1311
+ # for box, phrase in zip(boxes_filt, pred_phrases):
1312
+ # obj, score = phrase
1313
+ # obj = obj[0]
1314
+ # if obj not in objects:
1315
+ # objects[obj] = (score, box)
1316
+ # if objects[obj][0] < score:
1317
+ # objects[obj] = (score, box)
1318
+ # try:
1319
+ # first_box = objects[obj_A][1].clone()
1320
+ # first_box[:2] -= first_box[2:] / 2
1321
+ # first_box[2:] += first_box[:2]
1322
+ # first_box = first_box.clamp(0, 0.99) * 224.0
1323
+ # first_box = first_box.numpy()
1324
+ # first_score = objects[obj_A][0]
1325
+ # except:
1326
+ # first_box = None
1327
+
1328
+ if first_box is None:
1329
+ text_A = "the " + obj_A
1330
+ added_bbox_list = None
1331
+ else:
1332
+ text_A = visual_obj_A
1333
+ added_bbox_list = [torch.tensor(first_box).unsqueeze(0).cuda() / 224]
1334
+
1335
+ prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>{text_A} is {relation}<|#object#|><|#previsual#|>"]
1336
+ pre_boxes, pre_scores = get_bbox(added_bbox_list, batch_images, prompt, model, tokenizer, media_token_id,
1337
+ prebox_token_id, mask_prebox=False, debug=False, return_all=True)
1338
+
1339
+
1340
+ open_cv_image = np.array(image)
1341
+ open_cv_image = open_cv_image[:, :, ::-1].copy()
1342
+ font = cv2.FONT_HERSHEY_SIMPLEX
1343
+ fontScale = 0.5
1344
+ color = (0, 0, 0)
1345
+ thickness = 1
1346
+ if first_box is not None:
1347
+ open_cv_image = cv2.rectangle(open_cv_image, first_box[:2].astype(int), first_box[2:].astype(int), (255, 0, 0), 2)
1348
+ exist_flag = False
1349
+ for box, score in zip(pre_boxes, pre_scores):
1350
+ if score >= 0.5:
1351
+ exist_flag = True
1352
+ open_cv_image = cv2.rectangle(open_cv_image, box[:2].astype(int), box[2:].astype(int), (0, 255, 0), 2)
1353
+ org = box[:2].astype(int)
1354
+ org[1] += 20
1355
+ org[0] += 10
1356
+ open_cv_image = cv2.putText(open_cv_image, f"{score:.2f}", org, font, fontScale, (255, 255, 255), thickness, cv2.LINE_AA)
1357
+ open_cv_image = cv2.resize(open_cv_image, (512, 512))
1358
+ put_text = sample["caption_options"][chosen_idx]
1359
+ org = [10, 20]
1360
+ open_cv_image = cv2.putText(open_cv_image, put_text, org, font, fontScale, color, thickness, cv2.LINE_AA)
1361
+ # cv2.imwrite(f"visualization/aro_results_{id}/{str(ORI_IDX).zfill(8)}.jpg", open_cv_image)
1362
+ if exist_flag:
1363
+ exist_total += 1
1364
+ continue
1365
+
1366
+
1367
+
1368
+ if pre_boxes is None:
1369
+ pre_boxes = [np.array([0.0, 0.0, 223.0, 223.0])]
1370
+ pre_scores = [1.0]
1371
+
1372
+ rank_list = []
1373
+ # pre_boxes = [pre_boxes[0]]
1374
+ # pre_scores = [pre_scores[0]]
1375
+ for pre_box, pre_score in zip(pre_boxes, pre_scores):
1376
+ prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>{text_A} is {relation}<|#object#|><|#previsual#|><|#prebox#|><|#object#|> the {obj_B}<|#endofobject#|>"]
1377
+
1378
+ encodings = tokenizer(
1379
+ prompt,
1380
+ padding="longest",
1381
+ truncation=True,
1382
+ return_tensors="pt",
1383
+ max_length=512,
1384
+ )
1385
+ input_ids = encodings["input_ids"]
1386
+ attention_mask = encodings["attention_mask"]
1387
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
1388
+ image_start_index_list = [[x] for x in image_start_index_list]
1389
+ image_nums = [1] * len(input_ids)
1390
+ vision_x = batch_images.cuda()
1391
+ lang_x = input_ids.cuda()
1392
+ attention_mask = attention_mask.cuda()
1393
+ labels = lang_x.clone()
1394
+
1395
+ answer_start_idx = (labels == tokenizer("<|#object#|>", add_special_tokens=False)["input_ids"][-1]).nonzero()[-1][1] + 1
1396
+ # pre_box = None
1397
+ labels[0, :answer_start_idx] = -100
1398
+ # # labels[labels == endofobject_token_id] = -100
1399
+ # labels[:, 0] = -100
1400
+ # labels[labels == visual_token_id] = -100
1401
+ # labels[labels == box_token_id] = -100
1402
+ # labels[labels == previsual_token_id] = -100
1403
+ # labels[labels == prebox_token_id] = -100
1404
+ # labels[labels == endofattr_token_id] = -100
1405
+ # labels[labels == tokenizer.pad_token_id] = -100
1406
+ # labels[labels == media_token_id] = -100
1407
+ # labels[labels == endofmedia_token_id] = -100
1408
+ answer_ids = tokenizer(f" {obj_B}", add_special_tokens=False)["input_ids"]
1409
+ labels[input_ids == visual_token_id] = -100
1410
+ labels[input_ids == box_token_id] = -100
1411
+ labels[input_ids == endofattr_token_id] = -100
1412
+ labels[input_ids == previsual_token_id] = -100
1413
+ labels[input_ids == prebox_token_id] = -100
1414
+ labels[torch.roll(input_ids == prebox_token_id, 1)] = -100
1415
+ labels[torch.roll(input_ids == box_token_id, 1)] = -100
1416
+ labels[:, 0] = -100
1417
+ labels[input_ids == tokenizer.pad_token_id] = -100
1418
+ labels[input_ids == media_token_id] = -100
1419
+ labels[input_ids == endofmedia_token_id] = -100
1420
+
1421
+ added_bbox_list = None
1422
+ if add_visual:
1423
+ added_bbox_list = []
1424
+ if first_box is not None:
1425
+ added_bbox_list.append(torch.tensor(first_box).unsqueeze(0).cuda().float() / 224)
1426
+ if pre_box is not None:
1427
+ added_bbox_list.append(torch.tensor(pre_box).unsqueeze(0).cuda().float() / 224)
1428
+ if added_bbox_list is not None and len(added_bbox_list) == 0:
1429
+ added_bbox_list = None
1430
+
1431
+ with torch.cuda.amp.autocast(dtype=torch.float16), torch.no_grad():
1432
+ outputs = model(
1433
+ vision_x=vision_x,
1434
+ lang_x=lang_x,
1435
+ attention_mask=attention_mask,
1436
+ labels=labels,
1437
+ image_nums=image_nums,
1438
+ image_start_index_list=image_start_index_list,
1439
+ added_bbox_list=added_bbox_list,
1440
+ add_box=added_bbox_list is not None,
1441
+ relations=None,
1442
+ )
1443
+ logits = outputs["logits"][0, answer_start_idx:]
1444
+ _rank = logits[0][label_ids].sort(descending=True).indices.tolist().index(label_ids.index(answer_ids[0]))
1445
+ rank_list.append(_rank)
1446
+ # open_cv_image = np.array(image)
1447
+ # open_cv_image = open_cv_image[:, :, ::-1].copy()
1448
+ # if first_box is not None:
1449
+ # open_cv_image = cv2.rectangle(open_cv_image, first_box[:2].astype(int), first_box[2:].astype(int), (255, 0, 0), 2)
1450
+ # if pre_box is not None:
1451
+ # open_cv_image = cv2.rectangle(open_cv_image, pre_box[:2].astype(int), pre_box[2:].astype(int), (0, 255, 0), 2)
1452
+
1453
+ # font = cv2.FONT_HERSHEY_SIMPLEX
1454
+ # org = [10, 20]
1455
+ # fontScale = 0.5
1456
+ # color = (0, 0, 0)
1457
+ # thickness = 1
1458
+ # open_cv_image = cv2.resize(open_cv_image, (512, 512))
1459
+ # put_text = sample["caption_options"][1]
1460
+ # open_cv_image = cv2.putText(open_cv_image, put_text, org, font, fontScale, color, thickness, cv2.LINE_AA)
1461
+ # org[1] += 20
1462
+ # put_text = "top10 in green box"
1463
+ # open_cv_image = cv2.putText(open_cv_image, put_text, org, font, fontScale, color, thickness, cv2.LINE_AA)
1464
+ # fontScale = 1.0
1465
+ # thickness = 2
1466
+ # for ind in logits_list[i][0].sort(descending=True).indices[:10]:
1467
+ # org[1] += 20
1468
+ # put_text = f"{tokenizer.decode(ind)}"
1469
+ # open_cv_image = cv2.putText(open_cv_image, put_text, org, font, fontScale, color, thickness, cv2.LINE_AA)
1470
+ # tqdm.write(f"{tokenizer.decode(logits_list[i][0].sort(descending=True).indices[:10])}")
1471
+ # tqdm.write(f"{rank_list}")
1472
+ final_rank = min(rank_list)
1473
+ if final_rank < 10:
1474
+ correct += 1
1475
+ TYPE = "CORRECT"
1476
+ if rank == 0:
1477
+ tqdm.write(f"correct: {final_rank} " + prompt[0].replace(tokenizer.pad_token, ""))
1478
+ else:
1479
+ TYPE = "WRONG"
1480
+ if rank == 0:
1481
+ tqdm.write(f"wrong: {final_rank} " + prompt[0].replace(tokenizer.pad_token, ""))
1482
+ # cv2.imwrite(f"visualization/aro_results_{id}/{TYPE}_{ORI_IDX}.jpg", open_cv_image)
1483
+ pbar.set_description(f"score: {correct / total:.4f} | {final_rank}")
1484
+
1485
+
1486
+
1487
+
1488
+
1489
+ print(exist_total)
1490
+ exit()
1491
+
1492
+
1493
+
1494
+
1495
+ with open(f"{dataset_name}_results_part{rank}_{id}.json", "w") as f:
1496
+ f.write(json.dumps([total, correct]))
1497
+ if world_size > 1:
1498
+ torch.distributed.barrier()
1499
+ if rank == 0:
1500
+ total = 0
1501
+ correct = 0
1502
+ print(f"evaluate on rank {rank}. world size is {world_size}")
1503
+ for rank_i in range(world_size):
1504
+ [total_part, correct_part] = json.load(open(f"{dataset_name}_results_part{rank_i}_{id}.json"))
1505
+ os.remove(f"{dataset_name}_results_part{rank_i}_{id}.json")
1506
+ total += total_part
1507
+ correct += correct_part
1508
+ score = correct / total
1509
+ print("score:", score, "total:", total)
1510
+ with open(os.path.join("eval_results", f"{dataset_name}_{model.expr_name}_{model.step_num}_{int(time.time())}_{score}"), "w") as f:
1511
+ pass
1512
+ else:
1513
+ score = 0.0
1514
+ if world_size > 1:
1515
+ torch.distributed.barrier()
1516
+ return score
1517
+
1518
+
1519
+
1520
+
1521
+ def evaluate_aro_ori(
1522
+ model,
1523
+ tokenizer,
1524
+ image_processor,
1525
+ batch_size,
1526
+ tsvfile,
1527
+ max_generation_length=20,
1528
+ num_beams=3,
1529
+ length_penalty=-2.0,
1530
+ device=-1,
1531
+ vis_embed_size=None,
1532
+ rank=0,
1533
+ world_size=1,
1534
+ id=0,
1535
+ add_visual=True,
1536
+ add_relation=False,
1537
+ subset=True,
1538
+ choose_left_right=True,
1539
+ only_highest=True,
1540
+ ):
1541
+ os.makedirs(f"visualization/aro_results_{id}", exist_ok=True)
1542
+ dataset_name = "aroori"
1543
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
1544
+ box_token_id = tokenizer("<|#box#|>", add_special_tokens=False)["input_ids"][-1]
1545
+ endofobject_token_id = tokenizer("<|#endofobject#|>", add_special_tokens=False)["input_ids"][-1]
1546
+ endofattr_token_id = tokenizer("<|#endofattr#|>", add_special_tokens=False)["input_ids"][-1]
1547
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
1548
+ visual_token_id = tokenizer("<|#visual#|>", add_special_tokens=False)["input_ids"][-1]
1549
+ previsual_token_id = tokenizer("<|#previsual#|>", add_special_tokens=False)["input_ids"][-1]
1550
+ prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
1551
+ model.eval().cuda()
1552
+ total = 0
1553
+ correct = 0
1554
+ from open_flamingo.eval.dataset_zoo import VG_Relation, VG_Attribution
1555
+ vgr_dataset = VG_Relation(image_preprocess=None, download=True, root_dir="/gpfs/u/home/LMCG/LMCGljnn/scratch/code/vision-language-models-are-bows/data")
1556
+ if subset:
1557
+ subset_idx = json.load(open("aro_subset.json"))
1558
+ pbar = tqdm(subset_idx, disable=(rank != 0))
1559
+ else:
1560
+ pbar = tqdm(vgr_dataset, disable=(rank != 0))
1561
+ for ii, sample in enumerate(pbar):
1562
+ if subset:
1563
+ ORI_IDX = int(sample)
1564
+ sample = vgr_dataset[sample]
1565
+ # if ORI_IDX != 19036:
1566
+ # continue
1567
+ if ii % world_size != rank:
1568
+ continue
1569
+
1570
+ not_left_right = ("near" in sample["caption_options"][0] or "next to" in sample["caption_options"][0] or "in front of" in sample["caption_options"][0] or "behind" in sample["caption_options"][0]) or ("left" not in sample["caption_options"][0] and "right" not in sample["caption_options"][0])
1571
+ if (choose_left_right and not_left_right) or (not choose_left_right and not not_left_right):
1572
+ if rank == 0:
1573
+ tqdm.write(f"SKIP: {sample['caption_options'][1]}")
1574
+ continue
1575
+ total += 1
1576
+ image = sample["image_options"][0]
1577
+ # image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal2/yolo.png").convert("RGB")
1578
+ image = image.resize((224, 224))
1579
+ debug_data = []
1580
+ final_losses = []
1581
+ for idx in range(2):
1582
+ text = sample["caption_options"][idx] # 1 is true caption
1583
+ # text = "the dog is sitting on the floor" if idx == 1 else "the floor is sitting on the dog"
1584
+ batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
1585
+ text, obj_A, visual_obj_A, obj_B, visual_obj_B, relation = preprocess_visual_info(text)
1586
+ first_text = f"<|#object#|>the {obj_A}<|#endofobject#|><|#visual#|>"
1587
+ prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>{first_text}"]
1588
+ first_box, first_score = get_bbox(None, batch_images, prompt, model, tokenizer, media_token_id, prebox_token_id, mask_prebox=True, return_all=False)
1589
+ if first_box is None:
1590
+ text_A = "the " + obj_A
1591
+ added_bbox_list = None
1592
+ else:
1593
+ text_A = visual_obj_A
1594
+ added_bbox_list = [torch.tensor(first_box).unsqueeze(0).cuda() / 224]
1595
+
1596
+ prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>{text_A} is {relation}<|#object#|><|#previsual#|>"]
1597
+ pre_boxes, pre_scores = get_bbox(added_bbox_list, batch_images, prompt, model, tokenizer, media_token_id,
1598
+ prebox_token_id, mask_prebox=False, debug=False, return_all=True)
1599
+ if pre_boxes is None:
1600
+ pre_boxes = [np.array([0.0, 0.0, 223.0, 223.0])]
1601
+ pre_scores = [1.0]
1602
+
1603
+ loss_list = []
1604
+ if only_highest:
1605
+ pre_boxes = [pre_boxes[0]]
1606
+ pre_scores = [pre_scores[0]]
1607
+ for pre_box, pre_score in zip(pre_boxes, pre_scores):
1608
+ prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>{text_A} is {relation}<|#object#|><|#previsual#|><|#prebox#|><|#object#|> the {obj_B}<|#endofobject#|>"]
1609
+
1610
+ encodings = tokenizer(
1611
+ prompt,
1612
+ padding="longest",
1613
+ truncation=True,
1614
+ return_tensors="pt",
1615
+ max_length=512,
1616
+ )
1617
+ input_ids = encodings["input_ids"]
1618
+ attention_mask = encodings["attention_mask"]
1619
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
1620
+ image_start_index_list = [[x] for x in image_start_index_list]
1621
+ image_nums = [1] * len(input_ids)
1622
+ vision_x = batch_images.cuda()
1623
+ lang_x = input_ids.cuda()
1624
+ attention_mask = attention_mask.cuda()
1625
+ labels = lang_x.clone()
1626
+
1627
+
1628
+ labels[input_ids == visual_token_id] = -100
1629
+ labels[input_ids == box_token_id] = -100
1630
+ labels[input_ids == endofattr_token_id] = -100
1631
+ labels[input_ids == previsual_token_id] = -100
1632
+ labels[input_ids == prebox_token_id] = -100
1633
+ labels[torch.roll(input_ids == prebox_token_id, 1)] = -100
1634
+ labels[torch.roll(input_ids == box_token_id, 1)] = -100
1635
+ labels[:, 0] = -100
1636
+ labels[input_ids == tokenizer.pad_token_id] = -100
1637
+ labels[input_ids == media_token_id] = -100
1638
+ labels[input_ids == endofmedia_token_id] = -100
1639
+
1640
+ added_bbox_list = None
1641
+ if add_visual:
1642
+ added_bbox_list = []
1643
+ if first_box is not None:
1644
+ added_bbox_list.append(torch.tensor(first_box).unsqueeze(0).cuda().float() / 224)
1645
+ if pre_box is not None:
1646
+ added_bbox_list.append(torch.tensor(pre_box).unsqueeze(0).cuda().float() / 224)
1647
+ if added_bbox_list is not None and len(added_bbox_list) == 0:
1648
+ added_bbox_list = None
1649
+
1650
+ with torch.cuda.amp.autocast(dtype=torch.float16), torch.no_grad():
1651
+ outputs = model(
1652
+ vision_x=vision_x,
1653
+ lang_x=lang_x,
1654
+ attention_mask=attention_mask,
1655
+ labels=labels,
1656
+ image_nums=image_nums,
1657
+ image_start_index_list=image_start_index_list,
1658
+ added_bbox_list=added_bbox_list,
1659
+ add_box=added_bbox_list is not None,
1660
+ relations=None,
1661
+ )
1662
+ loss_list.append((outputs["loss"].sum() / (outputs["loss"] != 0).sum()).item())
1663
+ debug_data.append([outputs, first_box, first_score, pre_box, pre_scores])
1664
+ final_loss = min(loss_list)
1665
+ final_losses.append(final_loss)
1666
+ if final_losses[0] >= final_losses[1]:
1667
+ correct += 1
1668
+ else:
+ pass
1671
+ pbar.set_description(f"score: {correct / total:.4f} | {final_losses[0]:.2f} vs {final_losses[1]:.2f}")
1672
+
1673
+
1674
+ with open(f"{dataset_name}_results_part{rank}_{id}.json", "w") as f:
1675
+ f.write(json.dumps([total, correct]))
1676
+ if world_size > 1:
1677
+ torch.distributed.barrier()
1678
+ if rank == 0:
1679
+ total = 0
1680
+ correct = 0
1681
+ print(f"evaluate on rank {rank}. world size is {world_size}")
1682
+ for rank_i in range(world_size):
1683
+ [total_part, correct_part] = json.load(open(f"{dataset_name}_results_part{rank_i}_{id}.json"))
1684
+ os.remove(f"{dataset_name}_results_part{rank_i}_{id}.json")
1685
+ total += total_part
1686
+ correct += correct_part
1687
+ score = correct / total
1688
+ print("score:", score, "total:", total)
1689
+ with open(os.path.join("eval_results", f"{dataset_name}_{model.expr_name}_{model.step_num}_{int(time.time())}_{score}"), "w") as f:
1690
+ pass
1691
+ else:
1692
+ score = 0.0
1693
+ if world_size > 1:
1694
+ torch.distributed.barrier()
1695
+ return score
1696
+
1697
+
1698
+ def evaluate_pisc(
1699
+ model,
1700
+ tokenizer,
1701
+ image_processor,
1702
+ batch_size,
1703
+ tsvfile,
1704
+ max_generation_length=20,
1705
+ num_beams=3,
1706
+ length_penalty=-2.0,
1707
+ device=-1,
1708
+ vis_embed_size=None,
1709
+ rank=0,
1710
+ world_size=1,
1711
+ id=0,
1712
+ add_visual=True,
1713
+ ):
1714
+ from open_flamingo.train.instruction_template import PISC_TEMPLATES
1715
+ dataset_name = "pisc"
1716
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
1717
+ box_token_id = tokenizer("<|#box#|>", add_special_tokens=False)["input_ids"][-1]
1718
+ endofobject_token_id = tokenizer("<|#endofobject#|>", add_special_tokens=False)["input_ids"][-1]
1719
+ endofattr_token_id = tokenizer("<|#endofattr#|>", add_special_tokens=False)["input_ids"][-1]
1720
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
1721
+ visual_token_id = tokenizer("<|#visual#|>", add_special_tokens=False)["input_ids"][-1]
1722
+ model.train().cuda()
1723
+
1724
+ dataset = wds.WebDataset("/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/junyan/raw/instruct/eval/pisc/000000.tar").decode().to_tuple("image_path.txt", "dataset.txt", "data.pyd")
1725
+ pbar = tqdm(dataset, disable=(rank != 0))
1726
+
1727
+ rel_id_to_type = ["friends", "family", "couple", "professional", "commercial", "no relation"]
1728
+ rel_type_to_id = {x: i for i, x in enumerate(rel_id_to_type)}
1729
+ gt = []
1730
+ pred_scores = []
1731
+ for III, sample in enumerate(pbar):
1732
+ if III % world_size != rank:
1733
+ continue
1734
+ image_path, dataset, data = sample
1735
+ image = Image.open(image_path)
1736
+ size = image_processor.transforms[0].size
1737
+ image = image.resize((size, size))
1738
+ batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
1739
+ boxA = data[0]
1740
+ boxB = data[1]
1741
+ gt_relation = data[2]
1742
+ losses = []
1743
+ for i_rel, option_rel in enumerate(rel_id_to_type):
1744
+ text = PISC_TEMPLATES[0].format(relation=option_rel)
1745
+ added_bbox = [
1746
+ torch.tensor([boxA]).cuda(),
1747
+ torch.tensor([boxB]).cuda(),
1748
+ ]
1749
+ caption = f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>{text}{tokenizer.eos_token}"
1750
+ encodings = tokenizer(
1751
+ caption,
1752
+ padding="longest",
1753
+ truncation=True,
1754
+ return_tensors="pt",
1755
+ max_length=2000,
1756
+ )
1757
+ input_ids = encodings["input_ids"]
1758
+ attention_mask = encodings["attention_mask"]
1759
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
1760
+ image_start_index_list = [[x] for x in image_start_index_list]
1761
+ image_nums = [1] * len(input_ids)
1762
+ vision_x = batch_images.cuda()
1763
+ lang_x = input_ids.cuda()
1764
+ attention_mask = attention_mask.cuda()
1765
+
1766
+ labels = lang_x.clone()
1767
+ labels[labels == tokenizer.pad_token_id] = -100
1768
+ if add_visual:
1769
+ # endofattr_next_token_index = list((labels == endofattr_token_id).nonzero(as_tuple=True))
1770
+ # endofattr_next_token_index[1] += 1
1771
+ # endofattr_next_token_id = labels[endofattr_next_token_index]
1772
+ # </obj><visual><box></attr>NEXT_WORD
1773
+ # </obj> predict NEXT_WORD
1774
+ # <visual><box></attr> predict nothing
1775
+ labels[labels == visual_token_id] = -100
1776
+ labels[labels == box_token_id] = -100
1777
+ labels[labels == endofattr_token_id] = -100
1778
+ # labels[endofattr_next_token_index] = -100
1779
+ labels[:, 0] = -100
1780
+ answer_token_id = tokenizer(" Answer").input_ids[0]
1781
+ answer_token_loc = (input_ids == answer_token_id).nonzero()
1782
+ for batch_idx, idx in answer_token_loc:
1783
+ labels[batch_idx][:idx+2] = -100
1784
+
1785
+ with torch.cuda.amp.autocast(dtype=torch.float16), torch.no_grad():
1786
+ outputs = model(
1787
+ vision_x=vision_x,
1788
+ lang_x=lang_x,
1789
+ attention_mask=attention_mask,
1790
+ labels=labels,
1791
+ image_nums=image_nums,
1792
+ image_start_index_list=image_start_index_list,
1793
+ added_bbox_list=added_bbox,
1794
+ add_box=added_bbox is not None,
1795
+ )
1796
+ loss_total = outputs.loss.reshape(labels.shape[0], -1)
1797
+ loss = loss_total.sum() / (loss_total != 0).sum()
1798
+ losses.append(loss.item())
1799
+ pred_scores.append(np.exp(-np.array(losses)) / np.exp(-np.array(losses)).sum())
1800
+ gt.append(rel_type_to_id[gt_relation])
1801
+ gt = np.array(gt)
1802
+ pred_scores = np.array(pred_scores)
1803
+ pred = pred_scores.argmax(1)
1804
+
1805
+
1806
+ print("total num:", len(gt))
1807
+ recalls = recall_score(y_true=gt, y_pred=pred, average=None, labels=[0,1,2,3,4,5])
1808
+ print("recalls:", recalls)
1809
+
1810
+ with open(f"{dataset_name}_results_part{rank}_{id}.json", "w") as f:
1811
+ f.write(json.dumps([gt.tolist(), pred.tolist()]))
1812
+ if world_size > 1:
1813
+ torch.distributed.barrier()
1814
+ if rank == 0:
1815
+ gt = []
1816
+ pred = []
1817
+ print(f"evaluate on rank {rank}. world size is {world_size}")
1818
+ for rank_i in range(world_size):
1819
+ [gt_part, pred_part] = json.load(open(f"{dataset_name}_results_part{rank_i}_{id}.json"))
1820
+ os.remove(f"{dataset_name}_results_part{rank_i}_{id}.json")
1821
+ gt.extend(gt_part)
1822
+ pred.extend(pred_part)
1823
+ print("total num:", len(gt))
1824
+ recalls = recall_score(y_true=gt, y_pred=pred, average=None, labels=[0,1,2,3,4,5])
1825
+ print("recalls:", recalls)
1826
+ with open(os.path.join("eval_results", f"{dataset_name}_{model.expr_name}_{model.step_num}_{int(time.time())}"), "w") as f:
1827
+ f.write(f"{gt}\n")
1828
+ f.write(f"{pred}\n")
1829
+ f.write(f"{recalls}\n")
1830
+ score = 0.0
1831
+ if world_size > 1:
1832
+ torch.distributed.barrier()
1833
+ return score
1834
+
1835
+
1836
+
1837
+ if __name__ == "__main__":
1838
+ main()
multimodal/build/lib/open_flamingo/eval/imagenet_utils.py ADDED
@@ -0,0 +1,1007 @@
1
+ # classnames via https://github.com/mlfoundations/wise-ft/blob/master/src/datasets/imagenet_classnames.py#L1
2
+ openai_imagenet_classnames = [
3
+ "tench",
4
+ "goldfish",
5
+ "great white shark",
6
+ "tiger shark",
7
+ "hammerhead shark",
8
+ "electric ray",
9
+ "stingray",
10
+ "rooster",
11
+ "hen",
12
+ "ostrich",
13
+ "brambling",
14
+ "goldfinch",
15
+ "house finch",
16
+ "junco",
17
+ "indigo bunting",
18
+ "American robin",
19
+ "bulbul",
20
+ "jay",
21
+ "magpie",
22
+ "chickadee",
23
+ "American dipper",
24
+ "kite (bird of prey)",
25
+ "bald eagle",
26
+ "vulture",
27
+ "great grey owl",
28
+ "fire salamander",
29
+ "smooth newt",
30
+ "newt",
31
+ "spotted salamander",
32
+ "axolotl",
33
+ "American bullfrog",
34
+ "tree frog",
35
+ "tailed frog",
36
+ "loggerhead sea turtle",
37
+ "leatherback sea turtle",
38
+ "mud turtle",
39
+ "terrapin",
40
+ "box turtle",
41
+ "banded gecko",
42
+ "green iguana",
43
+ "Carolina anole",
44
+ "desert grassland whiptail lizard",
45
+ "agama",
46
+ "frilled-necked lizard",
47
+ "alligator lizard",
48
+ "Gila monster",
49
+ "European green lizard",
50
+ "chameleon",
51
+ "Komodo dragon",
52
+ "Nile crocodile",
53
+ "American alligator",
54
+ "triceratops",
55
+ "worm snake",
56
+ "ring-necked snake",
57
+ "eastern hog-nosed snake",
58
+ "smooth green snake",
59
+ "kingsnake",
60
+ "garter snake",
61
+ "water snake",
62
+ "vine snake",
63
+ "night snake",
64
+ "boa constrictor",
65
+ "African rock python",
66
+ "Indian cobra",
67
+ "green mamba",
68
+ "sea snake",
69
+ "Saharan horned viper",
70
+ "eastern diamondback rattlesnake",
71
+ "sidewinder rattlesnake",
72
+ "trilobite",
73
+ "harvestman",
74
+ "scorpion",
75
+ "yellow garden spider",
76
+ "barn spider",
77
+ "European garden spider",
78
+ "southern black widow",
79
+ "tarantula",
80
+ "wolf spider",
81
+ "tick",
82
+ "centipede",
83
+ "black grouse",
84
+ "ptarmigan",
85
+ "ruffed grouse",
86
+ "prairie grouse",
87
+ "peafowl",
88
+ "quail",
89
+ "partridge",
90
+ "african grey parrot",
91
+ "macaw",
92
+ "sulphur-crested cockatoo",
93
+ "lorikeet",
94
+ "coucal",
95
+ "bee eater",
96
+ "hornbill",
97
+ "hummingbird",
98
+ "jacamar",
99
+ "toucan",
100
+ "duck",
101
+ "red-breasted merganser",
102
+ "goose",
103
+ "black swan",
104
+ "tusker",
105
+ "echidna",
106
+ "platypus",
107
+ "wallaby",
108
+ "koala",
109
+ "wombat",
110
+ "jellyfish",
111
+ "sea anemone",
112
+ "brain coral",
113
+ "flatworm",
114
+ "nematode",
115
+ "conch",
116
+ "snail",
117
+ "slug",
118
+ "sea slug",
119
+ "chiton",
120
+ "chambered nautilus",
121
+ "Dungeness crab",
122
+ "rock crab",
123
+ "fiddler crab",
124
+ "red king crab",
125
+ "American lobster",
126
+ "spiny lobster",
127
+ "crayfish",
128
+ "hermit crab",
129
+ "isopod",
130
+ "white stork",
131
+ "black stork",
132
+ "spoonbill",
133
+ "flamingo",
134
+ "little blue heron",
135
+ "great egret",
136
+ "bittern bird",
137
+ "crane bird",
138
+ "limpkin",
139
+ "common gallinule",
140
+ "American coot",
141
+ "bustard",
142
+ "ruddy turnstone",
143
+ "dunlin",
144
+ "common redshank",
145
+ "dowitcher",
146
+ "oystercatcher",
147
+ "pelican",
148
+ "king penguin",
149
+ "albatross",
150
+ "grey whale",
151
+ "killer whale",
152
+ "dugong",
153
+ "sea lion",
154
+ "Chihuahua",
155
+ "Japanese Chin",
156
+ "Maltese",
157
+ "Pekingese",
158
+ "Shih Tzu",
159
+ "King Charles Spaniel",
160
+ "Papillon",
161
+ "toy terrier",
162
+ "Rhodesian Ridgeback",
163
+ "Afghan Hound",
164
+ "Basset Hound",
165
+ "Beagle",
166
+ "Bloodhound",
167
+ "Bluetick Coonhound",
168
+ "Black and Tan Coonhound",
169
+ "Treeing Walker Coonhound",
170
+ "English foxhound",
171
+ "Redbone Coonhound",
172
+ "borzoi",
173
+ "Irish Wolfhound",
174
+ "Italian Greyhound",
175
+ "Whippet",
176
+ "Ibizan Hound",
177
+ "Norwegian Elkhound",
178
+ "Otterhound",
179
+ "Saluki",
180
+ "Scottish Deerhound",
181
+ "Weimaraner",
182
+ "Staffordshire Bull Terrier",
183
+ "American Staffordshire Terrier",
184
+ "Bedlington Terrier",
185
+ "Border Terrier",
186
+ "Kerry Blue Terrier",
187
+ "Irish Terrier",
188
+ "Norfolk Terrier",
189
+ "Norwich Terrier",
190
+ "Yorkshire Terrier",
191
+ "Wire Fox Terrier",
192
+ "Lakeland Terrier",
193
+ "Sealyham Terrier",
194
+ "Airedale Terrier",
195
+ "Cairn Terrier",
196
+ "Australian Terrier",
197
+ "Dandie Dinmont Terrier",
198
+ "Boston Terrier",
199
+ "Miniature Schnauzer",
200
+ "Giant Schnauzer",
201
+ "Standard Schnauzer",
202
+ "Scottish Terrier",
203
+ "Tibetan Terrier",
204
+ "Australian Silky Terrier",
205
+ "Soft-coated Wheaten Terrier",
206
+ "West Highland White Terrier",
207
+ "Lhasa Apso",
208
+ "Flat-Coated Retriever",
209
+ "Curly-coated Retriever",
210
+ "Golden Retriever",
211
+ "Labrador Retriever",
212
+ "Chesapeake Bay Retriever",
213
+ "German Shorthaired Pointer",
214
+ "Vizsla",
215
+ "English Setter",
216
+ "Irish Setter",
217
+ "Gordon Setter",
218
+ "Brittany dog",
219
+ "Clumber Spaniel",
220
+ "English Springer Spaniel",
221
+ "Welsh Springer Spaniel",
222
+ "Cocker Spaniel",
223
+ "Sussex Spaniel",
224
+ "Irish Water Spaniel",
225
+ "Kuvasz",
226
+ "Schipperke",
227
+ "Groenendael dog",
228
+ "Malinois",
229
+ "Briard",
230
+ "Australian Kelpie",
231
+ "Komondor",
232
+ "Old English Sheepdog",
233
+ "Shetland Sheepdog",
234
+ "collie",
235
+ "Border Collie",
236
+ "Bouvier des Flandres dog",
237
+ "Rottweiler",
238
+ "German Shepherd Dog",
239
+ "Dobermann",
240
+ "Miniature Pinscher",
241
+ "Greater Swiss Mountain Dog",
242
+ "Bernese Mountain Dog",
243
+ "Appenzeller Sennenhund",
244
+ "Entlebucher Sennenhund",
245
+ "Boxer",
246
+ "Bullmastiff",
247
+ "Tibetan Mastiff",
248
+ "French Bulldog",
249
+ "Great Dane",
250
+ "St. Bernard",
251
+ "husky",
252
+ "Alaskan Malamute",
253
+ "Siberian Husky",
254
+ "Dalmatian",
255
+ "Affenpinscher",
256
+ "Basenji",
257
+ "pug",
258
+ "Leonberger",
259
+ "Newfoundland dog",
260
+ "Great Pyrenees dog",
261
+ "Samoyed",
262
+ "Pomeranian",
263
+ "Chow Chow",
264
+ "Keeshond",
265
+ "brussels griffon",
266
+ "Pembroke Welsh Corgi",
267
+ "Cardigan Welsh Corgi",
268
+ "Toy Poodle",
269
+ "Miniature Poodle",
270
+ "Standard Poodle",
271
+ "Mexican hairless dog (xoloitzcuintli)",
272
+ "grey wolf",
273
+ "Alaskan tundra wolf",
274
+ "red wolf or maned wolf",
275
+ "coyote",
276
+ "dingo",
277
+ "dhole",
278
+ "African wild dog",
279
+ "hyena",
280
+ "red fox",
281
+ "kit fox",
282
+ "Arctic fox",
283
+ "grey fox",
284
+ "tabby cat",
285
+ "tiger cat",
286
+ "Persian cat",
287
+ "Siamese cat",
288
+ "Egyptian Mau",
289
+ "cougar",
290
+ "lynx",
291
+ "leopard",
292
+ "snow leopard",
293
+ "jaguar",
294
+ "lion",
295
+ "tiger",
296
+ "cheetah",
297
+ "brown bear",
298
+ "American black bear",
299
+ "polar bear",
300
+ "sloth bear",
301
+ "mongoose",
302
+ "meerkat",
303
+ "tiger beetle",
304
+ "ladybug",
305
+ "ground beetle",
306
+ "longhorn beetle",
307
+ "leaf beetle",
308
+ "dung beetle",
309
+ "rhinoceros beetle",
310
+ "weevil",
311
+ "fly",
312
+ "bee",
313
+ "ant",
314
+ "grasshopper",
315
+ "cricket insect",
316
+ "stick insect",
317
+ "cockroach",
318
+ "praying mantis",
319
+ "cicada",
320
+ "leafhopper",
321
+ "lacewing",
322
+ "dragonfly",
323
+ "damselfly",
324
+ "red admiral butterfly",
325
+ "ringlet butterfly",
326
+ "monarch butterfly",
327
+ "small white butterfly",
328
+ "sulphur butterfly",
329
+ "gossamer-winged butterfly",
330
+ "starfish",
331
+ "sea urchin",
332
+ "sea cucumber",
333
+ "cottontail rabbit",
334
+ "hare",
335
+ "Angora rabbit",
336
+ "hamster",
337
+ "porcupine",
338
+ "fox squirrel",
339
+ "marmot",
340
+ "beaver",
341
+ "guinea pig",
342
+ "common sorrel horse",
343
+ "zebra",
344
+ "pig",
345
+ "wild boar",
346
+ "warthog",
347
+ "hippopotamus",
348
+ "ox",
349
+ "water buffalo",
350
+ "bison",
351
+ "ram (adult male sheep)",
352
+ "bighorn sheep",
353
+ "Alpine ibex",
354
+ "hartebeest",
355
+ "impala (antelope)",
356
+ "gazelle",
357
+ "arabian camel",
358
+ "llama",
359
+ "weasel",
360
+ "mink",
361
+ "European polecat",
362
+ "black-footed ferret",
363
+ "otter",
364
+ "skunk",
365
+ "badger",
366
+ "armadillo",
367
+ "three-toed sloth",
368
+ "orangutan",
369
+ "gorilla",
370
+ "chimpanzee",
371
+ "gibbon",
372
+ "siamang",
373
+ "guenon",
374
+ "patas monkey",
375
+ "baboon",
376
+ "macaque",
377
+ "langur",
378
+ "black-and-white colobus",
379
+ "proboscis monkey",
380
+ "marmoset",
381
+ "white-headed capuchin",
382
+ "howler monkey",
383
+ "titi monkey",
384
+ "Geoffroy's spider monkey",
385
+ "common squirrel monkey",
386
+ "ring-tailed lemur",
387
+ "indri",
388
+ "Asian elephant",
389
+ "African bush elephant",
390
+ "red panda",
391
+ "giant panda",
392
+ "snoek fish",
393
+ "eel",
394
+ "silver salmon",
395
+ "rock beauty fish",
396
+ "clownfish",
397
+ "sturgeon",
398
+ "gar fish",
399
+ "lionfish",
400
+ "pufferfish",
401
+ "abacus",
402
+ "abaya",
403
+ "academic gown",
404
+ "accordion",
405
+ "acoustic guitar",
406
+ "aircraft carrier",
407
+ "airliner",
408
+ "airship",
409
+ "altar",
410
+ "ambulance",
411
+ "amphibious vehicle",
412
+ "analog clock",
413
+ "apiary",
414
+ "apron",
415
+ "trash can",
416
+ "assault rifle",
417
+ "backpack",
418
+ "bakery",
419
+ "balance beam",
420
+ "balloon",
421
+ "ballpoint pen",
422
+ "Band-Aid",
423
+ "banjo",
424
+ "baluster / handrail",
425
+ "barbell",
426
+ "barber chair",
427
+ "barbershop",
428
+ "barn",
429
+ "barometer",
430
+ "barrel",
431
+ "wheelbarrow",
432
+ "baseball",
433
+ "basketball",
434
+ "bassinet",
435
+ "bassoon",
436
+ "swimming cap",
437
+ "bath towel",
438
+ "bathtub",
439
+ "station wagon",
440
+ "lighthouse",
441
+ "beaker",
442
+ "military hat (bearskin or shako)",
443
+ "beer bottle",
444
+ "beer glass",
445
+ "bell tower",
446
+ "baby bib",
447
+ "tandem bicycle",
448
+ "bikini",
449
+ "ring binder",
450
+ "binoculars",
451
+ "birdhouse",
452
+ "boathouse",
453
+ "bobsleigh",
454
+ "bolo tie",
455
+ "poke bonnet",
456
+ "bookcase",
457
+ "bookstore",
458
+ "bottle cap",
459
+ "hunting bow",
460
+ "bow tie",
461
+ "brass memorial plaque",
462
+ "bra",
463
+ "breakwater",
464
+ "breastplate",
465
+ "broom",
466
+ "bucket",
467
+ "buckle",
468
+ "bulletproof vest",
469
+ "high-speed train",
470
+ "butcher shop",
471
+ "taxicab",
472
+ "cauldron",
473
+ "candle",
474
+ "cannon",
475
+ "canoe",
476
+ "can opener",
477
+ "cardigan",
478
+ "car mirror",
479
+ "carousel",
480
+ "tool kit",
481
+ "cardboard box / carton",
482
+ "car wheel",
483
+ "automated teller machine",
484
+ "cassette",
485
+ "cassette player",
486
+ "castle",
487
+ "catamaran",
488
+ "CD player",
489
+ "cello",
490
+ "mobile phone",
491
+ "chain",
492
+ "chain-link fence",
493
+ "chain mail",
494
+ "chainsaw",
495
+ "storage chest",
496
+ "chiffonier",
497
+ "bell or wind chime",
498
+ "china cabinet",
499
+ "Christmas stocking",
500
+ "church",
501
+ "movie theater",
502
+ "cleaver",
503
+ "cliff dwelling",
504
+ "cloak",
505
+ "clogs",
506
+ "cocktail shaker",
507
+ "coffee mug",
508
+ "coffeemaker",
509
+ "spiral or coil",
510
+ "combination lock",
511
+ "computer keyboard",
512
+ "candy store",
513
+ "container ship",
514
+ "convertible",
515
+ "corkscrew",
516
+ "cornet",
517
+ "cowboy boot",
518
+ "cowboy hat",
519
+ "cradle",
520
+ "construction crane",
521
+ "crash helmet",
522
+ "crate",
523
+ "infant bed",
524
+ "Crock Pot",
525
+ "croquet ball",
526
+ "crutch",
527
+ "cuirass",
528
+ "dam",
529
+ "desk",
530
+ "desktop computer",
531
+ "rotary dial telephone",
532
+ "diaper",
533
+ "digital clock",
534
+ "digital watch",
535
+ "dining table",
536
+ "dishcloth",
537
+ "dishwasher",
538
+ "disc brake",
539
+ "dock",
540
+ "dog sled",
541
+ "dome",
542
+ "doormat",
543
+ "drilling rig",
544
+ "drum",
545
+ "drumstick",
546
+ "dumbbell",
547
+ "Dutch oven",
548
+ "electric fan",
549
+ "electric guitar",
550
+ "electric locomotive",
551
+ "entertainment center",
552
+ "envelope",
553
+ "espresso machine",
554
+ "face powder",
555
+ "feather boa",
556
+ "filing cabinet",
557
+ "fireboat",
558
+ "fire truck",
559
+ "fire screen",
560
+ "flagpole",
561
+ "flute",
562
+ "folding chair",
563
+ "football helmet",
564
+ "forklift",
565
+ "fountain",
566
+ "fountain pen",
567
+ "four-poster bed",
568
+ "freight car",
569
+ "French horn",
570
+ "frying pan",
571
+ "fur coat",
572
+ "garbage truck",
573
+ "gas mask or respirator",
574
+ "gas pump",
575
+ "goblet",
576
+ "go-kart",
577
+ "golf ball",
578
+ "golf cart",
579
+ "gondola",
580
+ "gong",
581
+ "gown",
582
+ "grand piano",
583
+ "greenhouse",
584
+ "radiator grille",
585
+ "grocery store",
586
+ "guillotine",
587
+ "hair clip",
588
+ "hair spray",
589
+ "half-track",
590
+ "hammer",
591
+ "hamper",
592
+ "hair dryer",
593
+ "hand-held computer",
594
+ "handkerchief",
595
+ "hard disk drive",
596
+ "harmonica",
597
+ "harp",
598
+ "combine harvester",
599
+ "hatchet",
600
+ "holster",
601
+ "home theater",
602
+ "honeycomb",
603
+ "hook",
604
+ "hoop skirt",
605
+ "gymnastic horizontal bar",
606
+ "horse-drawn vehicle",
607
+ "hourglass",
608
+ "iPod",
609
+ "clothes iron",
610
+ "carved pumpkin",
611
+ "jeans",
612
+ "jeep",
613
+ "T-shirt",
614
+ "jigsaw puzzle",
615
+ "rickshaw",
616
+ "joystick",
617
+ "kimono",
618
+ "knee pad",
619
+ "knot",
620
+ "lab coat",
621
+ "ladle",
622
+ "lampshade",
623
+ "laptop computer",
624
+ "lawn mower",
625
+ "lens cap",
626
+ "letter opener",
627
+ "library",
628
+ "lifeboat",
629
+ "lighter",
630
+ "limousine",
631
+ "ocean liner",
632
+ "lipstick",
633
+ "slip-on shoe",
634
+ "lotion",
635
+ "music speaker",
636
+ "loupe magnifying glass",
637
+ "sawmill",
638
+ "magnetic compass",
639
+ "messenger bag",
640
+ "mailbox",
641
+ "tights",
642
+ "one-piece bathing suit",
643
+ "manhole cover",
644
+ "maraca",
645
+ "marimba",
646
+ "mask",
647
+ "matchstick",
648
+ "maypole",
649
+ "maze",
650
+ "measuring cup",
651
+ "medicine cabinet",
652
+ "megalith",
653
+ "microphone",
654
+ "microwave oven",
655
+ "military uniform",
656
+ "milk can",
657
+ "minibus",
658
+ "miniskirt",
659
+ "minivan",
660
+ "missile",
661
+ "mitten",
662
+ "mixing bowl",
663
+ "mobile home",
664
+ "ford model t",
665
+ "modem",
666
+ "monastery",
667
+ "monitor",
668
+ "moped",
669
+ "mortar and pestle",
670
+ "graduation cap",
671
+ "mosque",
672
+ "mosquito net",
673
+ "vespa",
674
+ "mountain bike",
675
+ "tent",
676
+ "computer mouse",
677
+ "mousetrap",
678
+ "moving van",
679
+ "muzzle",
680
+ "metal nail",
681
+ "neck brace",
682
+ "necklace",
683
+ "baby pacifier",
684
+ "notebook computer",
685
+ "obelisk",
686
+ "oboe",
687
+ "ocarina",
688
+ "odometer",
689
+ "oil filter",
690
+ "pipe organ",
691
+ "oscilloscope",
692
+ "overskirt",
693
+ "bullock cart",
694
+ "oxygen mask",
695
+ "product packet / packaging",
696
+ "paddle",
697
+ "paddle wheel",
698
+ "padlock",
699
+ "paintbrush",
700
+ "pajamas",
701
+ "palace",
702
+ "pan flute",
703
+ "paper towel",
704
+ "parachute",
705
+ "parallel bars",
706
+ "park bench",
707
+ "parking meter",
708
+ "railroad car",
709
+ "patio",
710
+ "payphone",
711
+ "pedestal",
712
+ "pencil case",
713
+ "pencil sharpener",
714
+ "perfume",
715
+ "Petri dish",
716
+ "photocopier",
717
+ "plectrum",
718
+ "Pickelhaube",
719
+ "picket fence",
720
+ "pickup truck",
721
+ "pier",
722
+ "piggy bank",
723
+ "pill bottle",
724
+ "pillow",
725
+ "ping-pong ball",
726
+ "pinwheel",
727
+ "pirate ship",
728
+ "drink pitcher",
729
+ "block plane",
730
+ "planetarium",
731
+ "plastic bag",
732
+ "plate rack",
733
+ "farm plow",
734
+ "plunger",
735
+ "Polaroid camera",
736
+ "pole",
737
+ "police van",
738
+ "poncho",
739
+ "pool table",
740
+ "soda bottle",
741
+ "plant pot",
742
+ "potter's wheel",
743
+ "power drill",
744
+ "prayer rug",
745
+ "printer",
746
+ "prison",
747
+ "missile",
748
+ "projector",
749
+ "hockey puck",
750
+ "punching bag",
751
+ "purse",
752
+ "quill",
753
+ "quilt",
754
+ "race car",
755
+ "racket",
756
+ "radiator",
757
+ "radio",
758
+ "radio telescope",
759
+ "rain barrel",
760
+ "recreational vehicle",
761
+ "fishing casting reel",
762
+ "reflex camera",
763
+ "refrigerator",
764
+ "remote control",
765
+ "restaurant",
766
+ "revolver",
767
+ "rifle",
768
+ "rocking chair",
769
+ "rotisserie",
770
+ "eraser",
771
+ "rugby ball",
772
+ "ruler measuring stick",
773
+ "sneaker",
774
+ "safe",
775
+ "safety pin",
776
+ "salt shaker",
777
+ "sandal",
778
+ "sarong",
779
+ "saxophone",
780
+ "scabbard",
781
+ "weighing scale",
782
+ "school bus",
783
+ "schooner",
784
+ "scoreboard",
785
+ "CRT monitor",
786
+ "screw",
787
+ "screwdriver",
788
+ "seat belt",
789
+ "sewing machine",
790
+ "shield",
791
+ "shoe store",
792
+ "shoji screen / room divider",
793
+ "shopping basket",
794
+ "shopping cart",
795
+ "shovel",
796
+ "shower cap",
797
+ "shower curtain",
798
+ "ski",
799
+ "balaclava ski mask",
800
+ "sleeping bag",
801
+ "slide rule",
802
+ "sliding door",
803
+ "slot machine",
804
+ "snorkel",
805
+ "snowmobile",
806
+ "snowplow",
807
+ "soap dispenser",
808
+ "soccer ball",
809
+ "sock",
810
+ "solar thermal collector",
811
+ "sombrero",
812
+ "soup bowl",
813
+ "keyboard space bar",
814
+ "space heater",
815
+ "space shuttle",
816
+ "spatula",
817
+ "motorboat",
818
+ "spider web",
819
+ "spindle",
820
+ "sports car",
821
+ "spotlight",
822
+ "stage",
823
+ "steam locomotive",
824
+ "through arch bridge",
825
+ "steel drum",
826
+ "stethoscope",
827
+ "scarf",
828
+ "stone wall",
829
+ "stopwatch",
830
+ "stove",
831
+ "strainer",
832
+ "tram",
833
+ "stretcher",
834
+ "couch",
835
+ "stupa",
836
+ "submarine",
837
+ "suit",
838
+ "sundial",
839
+ "sunglasses",
840
+ "sunglasses",
841
+ "sunscreen",
842
+ "suspension bridge",
843
+ "mop",
844
+ "sweatshirt",
845
+ "swim trunks / shorts",
846
+ "swing",
847
+ "electrical switch",
848
+ "syringe",
849
+ "table lamp",
850
+ "tank",
851
+ "tape player",
852
+ "teapot",
853
+ "teddy bear",
854
+ "television",
855
+ "tennis ball",
856
+ "thatched roof",
857
+ "front curtain",
858
+ "thimble",
859
+ "threshing machine",
860
+ "throne",
861
+ "tile roof",
862
+ "toaster",
863
+ "tobacco shop",
864
+ "toilet seat",
865
+ "torch",
866
+ "totem pole",
867
+ "tow truck",
868
+ "toy store",
869
+ "tractor",
870
+ "semi-trailer truck",
871
+ "tray",
872
+ "trench coat",
873
+ "tricycle",
874
+ "trimaran",
875
+ "tripod",
876
+ "triumphal arch",
877
+ "trolleybus",
878
+ "trombone",
879
+ "hot tub",
880
+ "turnstile",
881
+ "typewriter keyboard",
882
+ "umbrella",
883
+ "unicycle",
884
+ "upright piano",
885
+ "vacuum cleaner",
886
+ "vase",
887
+ "vaulted or arched ceiling",
888
+ "velvet fabric",
889
+ "vending machine",
890
+ "vestment",
891
+ "viaduct",
892
+ "violin",
893
+ "volleyball",
894
+ "waffle iron",
895
+ "wall clock",
896
+ "wallet",
897
+ "wardrobe",
898
+ "military aircraft",
899
+ "sink",
900
+ "washing machine",
901
+ "water bottle",
902
+ "water jug",
903
+ "water tower",
904
+ "whiskey jug",
905
+ "whistle",
906
+ "hair wig",
907
+ "window screen",
908
+ "window shade",
909
+ "Windsor tie",
910
+ "wine bottle",
911
+ "airplane wing",
912
+ "wok",
913
+ "wooden spoon",
914
+ "wool",
915
+ "split-rail fence",
916
+ "shipwreck",
917
+ "sailboat",
918
+ "yurt",
919
+ "website",
920
+ "comic book",
921
+ "crossword",
922
+ "traffic or street sign",
923
+ "traffic light",
924
+ "dust jacket",
925
+ "menu",
926
+ "plate",
927
+ "guacamole",
928
+ "consomme",
929
+ "hot pot",
930
+ "trifle",
931
+ "ice cream",
932
+ "popsicle",
933
+ "baguette",
934
+ "bagel",
935
+ "pretzel",
936
+ "cheeseburger",
937
+ "hot dog",
938
+ "mashed potatoes",
939
+ "cabbage",
940
+ "broccoli",
941
+ "cauliflower",
942
+ "zucchini",
943
+ "spaghetti squash",
944
+ "acorn squash",
945
+ "butternut squash",
946
+ "cucumber",
947
+ "artichoke",
948
+ "bell pepper",
949
+ "cardoon",
950
+ "mushroom",
951
+ "Granny Smith apple",
952
+ "strawberry",
953
+ "orange",
954
+ "lemon",
955
+ "fig",
956
+ "pineapple",
957
+ "banana",
958
+ "jackfruit",
959
+ "cherimoya (custard apple)",
960
+ "pomegranate",
961
+ "hay",
962
+ "carbonara",
963
+ "chocolate syrup",
964
+ "dough",
965
+ "meatloaf",
966
+ "pizza",
967
+ "pot pie",
968
+ "burrito",
969
+ "red wine",
970
+ "espresso",
971
+ "tea cup",
972
+ "eggnog",
973
+ "mountain",
974
+ "bubble",
975
+ "cliff",
976
+ "coral reef",
977
+ "geyser",
978
+ "lakeshore",
979
+ "promontory",
980
+ "sandbar",
981
+ "beach",
982
+ "valley",
983
+ "volcano",
984
+ "baseball player",
985
+ "bridegroom",
986
+ "scuba diver",
987
+ "rapeseed",
988
+ "daisy",
989
+ "yellow lady's slipper",
990
+ "corn",
991
+ "acorn",
992
+ "rose hip",
993
+ "horse chestnut seed",
994
+ "coral fungus",
995
+ "agaric",
996
+ "gyromitra",
997
+ "stinkhorn mushroom",
998
+ "earth star fungus",
999
+ "hen of the woods mushroom",
1000
+ "bolete",
1001
+ "corn cob",
1002
+ "toilet paper",
1003
+ ]
1004
+ # Maps numeric class ids to labels
1005
+ IMAGENET_1K_CLASS_ID_TO_LABEL = dict(
1006
+ zip(range(len(openai_imagenet_classnames)), openai_imagenet_classnames)
1007
+ )
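A quick sanity-check sketch of the resulting mapping (indices follow the list order above):

assert len(openai_imagenet_classnames) == 1000
assert IMAGENET_1K_CLASS_ID_TO_LABEL[0] == "tench"
assert IMAGENET_1K_CLASS_ID_TO_LABEL[999] == "toilet paper"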
multimodal/build/lib/open_flamingo/eval/ok_vqa_utils.py ADDED
@@ -0,0 +1,213 @@
1
+ # These are manual mappings that are not caught by our stemming rules or would
+ # be handled incorrectly by our automatic stemming rules. In detail,
+ # the keys of the _MANUAL_MATCHES dict contain the original word and the value
+ # contains the transformation of the word expected by the OKVQA stemming rule.
+ # These manual rules were found by checking the `raw_answers` and the `answers`
+ # fields of the released OKVQA dataset and checking all things that were not
+ # properly mapped by our automatic rules. In particular, some of the mappings
+ # are constant, e.g. christmas -> christmas, which was incorrectly
+ # singularized by our inflection.singularize.
10
+ import re
11
+ import nltk
12
+ from nltk.corpus.reader import VERB
13
+ import inflection
14
+
15
+ _MANUAL_MATCHES = {
16
+ "police": "police",
17
+ "las": "las",
18
+ "vegas": "vegas",
19
+ "yes": "yes",
20
+ "jeans": "jean",
21
+ "hell's": "hell",
22
+ "domino's": "domino",
23
+ "morning": "morn",
24
+ "clothes": "cloth",
25
+ "are": "are",
26
+ "riding": "ride",
27
+ "leaves": "leaf",
28
+ "dangerous": "danger",
29
+ "clothing": "cloth",
30
+ "texting": "text",
31
+ "kiting": "kite",
32
+ "firefighters": "firefight",
33
+ "ties": "tie",
34
+ "married": "married",
35
+ "teething": "teeth",
36
+ "gloves": "glove",
37
+ "tennis": "tennis",
38
+ "dining": "dine",
39
+ "directions": "direct",
40
+ "waves": "wave",
41
+ "christmas": "christmas",
42
+ "drives": "drive",
43
+ "pudding": "pud",
44
+ "coding": "code",
45
+ "plating": "plate",
46
+ "quantas": "quanta",
47
+ "hornes": "horn",
48
+ "graves": "grave",
49
+ "mating": "mate",
50
+ "paned": "pane",
51
+ "alertness": "alert",
52
+ "sunbathing": "sunbath",
53
+ "tenning": "ten",
54
+ "wetness": "wet",
55
+ "urinating": "urine",
56
+ "sickness": "sick",
57
+ "braves": "brave",
58
+ "firefighting": "firefight",
59
+ "lenses": "lens",
60
+ "reflections": "reflect",
61
+ "backpackers": "backpack",
62
+ "eatting": "eat",
63
+ "designers": "design",
64
+ "curiousity": "curious",
65
+ "playfulness": "play",
66
+ "blindness": "blind",
67
+ "hawke": "hawk",
68
+ "tomatoe": "tomato",
69
+ "rodeoing": "rodeo",
70
+ "brightness": "bright",
71
+ "circuses": "circus",
72
+ "skateboarders": "skateboard",
73
+ "staring": "stare",
74
+ "electronics": "electron",
75
+ "electicity": "elect",
76
+ "mountainous": "mountain",
77
+ "socializing": "social",
78
+ "hamburgers": "hamburg",
79
+ "caves": "cave",
80
+ "transitions": "transit",
81
+ "wading": "wade",
82
+ "creame": "cream",
83
+ "toileting": "toilet",
84
+ "sautee": "saute",
85
+ "buildings": "build",
86
+ "belongings": "belong",
87
+ "stockings": "stock",
88
+ "walle": "wall",
89
+ "cumulis": "cumuli",
90
+ "travelers": "travel",
91
+ "conducter": "conduct",
92
+ "browsing": "brows",
93
+ "pooping": "poop",
94
+ "haircutting": "haircut",
95
+ "toppings": "top",
96
+ "hearding": "heard",
97
+ "sunblocker": "sunblock",
98
+ "bases": "base",
99
+ "markings": "mark",
100
+ "mopeds": "mope",
101
+ "kindergartener": "kindergarten",
102
+ "pies": "pie",
103
+ "scrapbooking": "scrapbook",
104
+ "couponing": "coupon",
105
+ "meetings": "meet",
106
+ "elevators": "elev",
107
+ "lowes": "low",
108
+ "men's": "men",
109
+ "childrens": "children",
110
+ "shelves": "shelve",
111
+ "paintings": "paint",
112
+ "raines": "rain",
113
+ "paring": "pare",
114
+ "expressions": "express",
115
+ "routes": "rout",
116
+ "pease": "peas",
117
+ "vastness": "vast",
118
+ "awning": "awn",
119
+ "boy's": "boy",
120
+ "drunkenness": "drunken",
121
+ "teasing": "teas",
122
+ "conferences": "confer",
123
+ "ripeness": "ripe",
124
+ "suspenders": "suspend",
125
+ "earnings": "earn",
126
+ "reporters": "report",
127
+ "kid's": "kid",
128
+ "containers": "contain",
129
+ "corgie": "corgi",
130
+ "porche": "porch",
131
+ "microwaves": "microwave",
132
+ "batter's": "batter",
133
+ "sadness": "sad",
134
+ "apartments": "apart",
135
+ "oxygenize": "oxygen",
136
+ "striping": "stripe",
137
+ "purring": "pure",
138
+ "professionals": "profession",
139
+ "piping": "pipe",
140
+ "farmer's": "farmer",
141
+ "potatoe": "potato",
142
+ "emirates": "emir",
143
+ "womens": "women",
144
+ "veteran's": "veteran",
145
+ "wilderness": "wilder",
146
+ "propellers": "propel",
147
+ "alpes": "alp",
148
+ "charioteering": "chariot",
149
+ "swining": "swine",
150
+ "illness": "ill",
151
+ "crepte": "crept",
152
+ "adhesives": "adhesive",
153
+ "regent's": "regent",
154
+ "decorations": "decor",
155
+ "rabbies": "rabbi",
156
+ "overseas": "oversea",
157
+ "travellers": "travel",
158
+ "casings": "case",
159
+ "smugness": "smug",
160
+ "doves": "dove",
161
+ "nationals": "nation",
162
+ "mustange": "mustang",
163
+ "ringe": "ring",
164
+ "gondoliere": "gondolier",
165
+ "vacationing": "vacate",
166
+ "reminders": "remind",
167
+ "baldness": "bald",
168
+ "settings": "set",
169
+ "glaced": "glace",
170
+ "coniferous": "conifer",
171
+ "revelations": "revel",
172
+ "personals": "person",
173
+ "daughter's": "daughter",
174
+ "badness": "bad",
175
+ "projections": "project",
176
+ "polarizing": "polar",
177
+ "vandalizers": "vandal",
178
+ "minerals": "miner",
179
+ "protesters": "protest",
180
+ "controllers": "control",
181
+ "weddings": "wed",
182
+ "sometimes": "sometime",
183
+ "earing": "ear",
184
+ }
185
+
186
+
187
+ class OKVQAStemmer:
188
+ """Stemmer to match OKVQA v1.1 procedure."""
189
+
190
+ def __init__(self):
191
+ self._wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
192
+
193
+ def stem(self, input_string):
194
+ """Apply stemming."""
195
+ word_and_pos = nltk.pos_tag(nltk.tokenize.word_tokenize(input_string))
196
+ stemmed_words = []
197
+ for w, p in word_and_pos:
198
+ if w in _MANUAL_MATCHES:
199
+ w = _MANUAL_MATCHES[w]
200
+ elif w.endswith("ing"):
201
+ w = self._wordnet_lemmatizer.lemmatize(w, VERB)
202
+ elif p.startswith("NNS") or p.startswith("NNPS"):
203
+ w = inflection.singularize(w)
204
+ stemmed_words.append(w)
205
+ return " ".join(stemmed_words)
206
+
207
+
208
+ stemmer = OKVQAStemmer()
209
+
210
+
211
+ def postprocess_ok_vqa_generation(prediction) -> str:
212
+ prediction_stem = stemmer.stem(prediction)
213
+ return prediction_stem
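A minimal usage sketch (it assumes the required NLTK data — punkt, the POS tagger and WordNet — is available locally; both words hit the manual-match table, so the output does not depend on the tagger):

postprocess_ok_vqa_generation("texting")  # -> "text"
postprocess_ok_vqa_generation("riding")   # -> "ride"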
multimodal/build/lib/open_flamingo/eval/task/__init__.py ADDED
File without changes
multimodal/build/lib/open_flamingo/eval/task/caption.py ADDED
@@ -0,0 +1,419 @@
1
+ from lavis.datasets.builders import load_dataset
2
+ import torch
3
+ import more_itertools
4
+ from tqdm import tqdm
5
+ from coco_metric import compute_cider, postprocess_captioning_generation
6
+ import json
7
+ import time
8
+ import os
9
+ from transformers import LogitsProcessor, MinNewTokensLengthLogitsProcessor, ForcedEOSTokenLogitsProcessor
10
+ from PIL import Image
11
+
12
+ class VisualLogitsProcessor(LogitsProcessor):
13
+ def __init__(self, tokenizer):
14
+ super().__init__()
15
+ self.tokenizer = tokenizer
16
+ self.object_token_id = self.tokenizer("<|#object#|>", add_special_tokens=False)["input_ids"][-1]
17
+ self.prebox_token_id = self.tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
18
+ self.box_token_id = self.tokenizer("<|#box#|>", add_special_tokens=False)["input_ids"][-1]
19
+ self.previsual_token_id = self.tokenizer("<|#previsual#|>", add_special_tokens=False)["input_ids"][-1]
20
+ self.visual_token_id = self.tokenizer("<|#visual#|>", add_special_tokens=False)["input_ids"][-1]
21
+ self.eos_token_id = self.tokenizer.encode(self.tokenizer.eos_token)[-1]
22
+ self.endofobject_token_id = self.tokenizer("<|#endofobject#|>", add_special_tokens=False)["input_ids"][-1]
23
+ self.topk = 2
24
+
25
+ def __call__(self, input_ids, scores):
26
+ # print("decoding===>", self.tokenizer.decode(scores.sort(descending=True).indices.tolist()[0][:self.topk]))
27
+ # import pdb; pdb.set_trace()
28
+ if self.object_token_id in scores.sort(descending=True).indices.tolist()[0][1:self.topk] and self.eos_token_id not in scores.sort(descending=True).indices.tolist()[0][:self.topk] and (input_ids == self.object_token_id).sum() * 2 == (input_ids == self.endofobject_token_id).sum():
29
+ scores[0, self.object_token_id] = 1000
30
+ if input_ids[0, -1] == self.object_token_id and input_ids[0, -2] != self.prebox_token_id:
31
+ if (input_ids[0, :-1] == self.object_token_id).sum() != 0:
32
+ # print("generate a previsual token next")
33
+ scores[0, self.previsual_token_id] = 1000
34
+ elif input_ids[0, -1] == self.previsual_token_id or input_ids[0, -1] == self.visual_token_id:
35
+ # print("stop to run bbox generation for " + "previsual" if input_ids[0, -1] == self.previsual_token_id else "visual")
36
+ scores[0, self.eos_token_id] = 1000
37
+ elif input_ids[0, -1] == self.endofobject_token_id and input_ids[0, -2] != self.box_token_id:
38
+ # print("generate a visual token next")
39
+ scores[0, self.visual_token_id] = 1000
40
+ return scores
41
+
42
+
43
+ def prepare_batch_images(batch, image_processor):
44
+ batch_images = None
45
+ for b in batch:
46
+ b_image = image_processor(b["image"]).unsqueeze(0).unsqueeze(1).unsqueeze(0)
47
+ if batch_images is None:
48
+ batch_images = b_image
49
+ else:
50
+ batch_images = torch.cat([batch_images, b_image], dim=0)
51
+ return batch_images
52
+
53
+
54
+ def captioner(
55
+ model,tokenizer,image_ori,batch_images,input_ids,attention_mask,image_start_index_list,image_nums,added_bbox_list,debug=False):
56
+ """Generate a grounded caption for a single pre-processed image.
57
+ Returns:
58
+ tuple: (caption string, PIL image with predicted boxes drawn, or None)
59
+
60
+ """
61
+ visual_logits_processor = VisualLogitsProcessor(tokenizer)
62
+ model.eval()
63
+ # model.eval().cuda()
64
+ lang_encoder_name = model.lang_encoder.__class__.__name__.lower()
65
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
66
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
67
+ pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
68
+ bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
69
+ previsual_token_id = tokenizer("<|#previsual#|>", add_special_tokens=False)["input_ids"][-1]
70
+ visual_token_id = tokenizer("<|#visual#|>", add_special_tokens=False)["input_ids"][-1]
71
+ box_token = "<|#box#|>"
72
+ prebox_token = "<|#prebox#|>"
73
+ endofobject_token = "<|#endofobject#|>"
74
+ object_token = "<|#object#|>"
75
+ ori_prompt_length = len(input_ids[0])
76
+ have_prebox = False
77
+ out_image = None
78
+ while True:
79
+ batch_images = batch_images
80
+ input_ids = input_ids
81
+ attention_mask = attention_mask
82
+ image_start_index_list = image_start_index_list
83
+ image_nums = image_nums
84
+ if debug:
85
+ print("input--->",tokenizer.decode(input_ids[0]))
86
+ p1 = MinNewTokensLengthLogitsProcessor(
87
+ prompt_length_to_skip=input_ids.shape[-1],
88
+ min_new_tokens=5,
89
+ eos_token_id=bos_token_id,
90
+ )
91
+ with torch.inference_mode():
92
+ outputs = model.generate(
93
+ batch_images,
94
+ input_ids,
95
+ attention_mask=attention_mask,
96
+ max_new_tokens=20,
97
+ # min_new_tokens=8,
98
+ num_beams=1,
99
+ # length_penalty=0,
100
+ image_start_index_list=image_start_index_list,
101
+ image_nums=image_nums,
102
+ added_bbox_list=added_bbox_list if len(added_bbox_list) != 0 else None,
103
+ logits_processor_list=[p1, visual_logits_processor],
104
+ )
105
+ if debug:
106
+ print("outputs--->",tokenizer.decode(outputs[0]))
107
+ if outputs[0, -2] in [previsual_token_id, visual_token_id] and outputs[0, -1] == bos_token_id:
108
+ prompt = tokenizer.decode(outputs.clone()[0])
109
+ is_visual = (outputs[0, -2] == visual_token_id)
110
+ batch_text = tokenizer.batch_decode(outputs[:, :-1])
111
+ encodings = tokenizer(
112
+ batch_text,
113
+ padding="longest",
114
+ truncation=True,
115
+ return_tensors="pt",
116
+ max_length=2000,
117
+ )
118
+ input_ids = encodings["input_ids"]
119
+ attention_mask = encodings["attention_mask"]
120
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
121
+ image_start_index_list = [[x] for x in image_start_index_list]
122
+ image_nums = [1] * len(input_ids)
123
+ if debug:
124
+ print("get the visual bbox--->",tokenizer.decode(input_ids[0]))
125
+ with torch.no_grad():
126
+ outputs = model(
127
+ vision_x=batch_images,
128
+ lang_x=input_ids,
129
+ attention_mask=attention_mask,
130
+ image_nums=image_nums,
131
+ image_start_index_list=image_start_index_list,
132
+ added_bbox_list=added_bbox_list if len(added_bbox_list) != 0 else None,
133
+ add_box=added_bbox_list is not None and len(added_bbox_list) != 0,
134
+ )
135
+ boxes = outputs["boxes"]
136
+ scores = outputs["scores"]
137
+ # if not model.valid:
138
+ # import pdb; pdb.set_trace()
139
+ if boxes is not None:
140
+ if is_visual:
141
+ if have_prebox:
142
+ added_bbox_list.pop()
143
+ prompt = prompt.replace("<|#previsual#|><|#prebox#|><|#object#|>", "")
144
+ have_prebox = False
145
+ if debug:
146
+ print("find previsual and remove it--->", prompt)
147
+ first_box = boxes[scores.argmax()]
148
+ added_bbox_list += [torch.tensor(first_box).unsqueeze(0) / 224]
149
+ prompt = prompt[:-len(tokenizer.eos_token)]
150
+ prompt += box_token + endofobject_token
151
+ if debug:
152
+ print("after inserting visual---->", prompt)
153
+ else:
154
+ import numpy as np
155
+ import cv2
156
+ open_cv_image = np.array(image_ori)
157
+ open_cv_image = open_cv_image[:, :, ::-1].copy()
158
+ for i, pre_box in enumerate(boxes):
159
+ open_cv_image = cv2.rectangle(open_cv_image, pre_box[:2].astype(int), pre_box[2:].astype(int), (0, 255, 0), i+1)
160
+ out_image = Image.fromarray(cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2RGB))
161
+ # exit()
162
+ pre_box = boxes[scores.argmax()]
163
+ added_bbox_list += [torch.tensor(pre_box).unsqueeze(0).cuda() / 224]
164
+ prompt = prompt[:-len(tokenizer.eos_token)]
165
+ prompt += prebox_token + object_token
166
+ have_prebox = True
167
+ if debug:
168
+ print("after inserting previsual---->", prompt)
169
+ else:
170
+ if debug:
171
+ import pdb;pdb.set_trace()
172
+ prompt = tokenizer.decode(outputs[0, :-2].clone()[0])
173
+ else:
174
+ break
175
+ outputs = outputs[:, ori_prompt_length:]
176
+ outputs = postprocess_captioning_generation(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]).replace('"', "")
177
+ # new_predictions = [
178
+ # postprocess_captioning_generation(out).replace('"', "")
179
+ # for out in tokenizer.batch_decode(outputs, skip_special_tokens=True)
180
+ # ]
181
+ # import pdb; pdb.set_trace()
182
+ return outputs, out_image
183
+
184
+
185
+ def evaluate_coco_flickr(
186
+ model,
187
+ tokenizer,
188
+ image_processor,
189
+ batch_size,
190
+ is_flickr=False,
191
+ vis_embed_size=None,
192
+ rank=0,
193
+ world_size=1,
194
+ id=0,
195
+ debug=False,
196
+ ):
197
+ """Evaluate a model on COCO dataset.
198
+ Returns:
199
+ float: CIDEr score
200
+
201
+ """
202
+ visual_logits_processor = VisualLogitsProcessor(tokenizer)
203
+ coco_dataset = load_dataset("coco_caption")
204
+ eval_dataset = coco_dataset["test"]
205
+ model.eval().cuda()
206
+ predictions = dict()
207
+ lang_encoder_name = model.lang_encoder.__class__.__name__.lower()
208
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
209
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
210
+ pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
211
+ bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
212
+ previsual_token_id = tokenizer("<|#previsual#|>", add_special_tokens=False)["input_ids"][-1]
213
+ visual_token_id = tokenizer("<|#visual#|>", add_special_tokens=False)["input_ids"][-1]
214
+ box_token = "<|#box#|>"
215
+ prebox_token = "<|#prebox#|>"
216
+ endofobject_token = "<|#endofobject#|>"
217
+ object_token = "<|#object#|>"
218
+ cnt = 0
219
+ if world_size > 1:
220
+ torch.distributed.barrier()
221
+ desc = "Running inference Flickr30k" if is_flickr else "Running inference COCO"
222
+ for ii, batch in enumerate(more_itertools.chunked(
223
+ tqdm(eval_dataset, desc=desc, disable=(rank != 0)), batch_size
224
+ )):
225
+ if ii % world_size != rank:
226
+ continue
227
+ cnt += len(batch)
228
+ batch[0]["image"] = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/images/img3.jpg").resize((224, 224))
229
+ batch_images = prepare_batch_images(
230
+ batch=batch,
231
+ image_processor=image_processor,
232
+ ).cuda()
233
+ prompt = f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>"
234
+ added_bbox_list = []
235
+ batch_text = [prompt for _ in batch]
236
+ encodings = tokenizer(
237
+ batch_text,
238
+ padding="longest",
239
+ truncation=True,
240
+ return_tensors="pt",
241
+ max_length=2000,
242
+ )
243
+ ori_prompt_length = len(encodings["input_ids"][0])
244
+ have_prebox = False
245
+ while True:
246
+ batch_text = [prompt for _ in batch]
247
+ encodings = tokenizer(
248
+ batch_text,
249
+ padding="longest",
250
+ truncation=True,
251
+ return_tensors="pt",
252
+ max_length=2000,
253
+ )
254
+ input_ids = encodings["input_ids"].cuda()
255
+ attention_mask = encodings["attention_mask"].cuda()
256
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
257
+ image_start_index_list = [[x] for x in image_start_index_list]
258
+ image_nums = [1] * len(input_ids)
259
+ if debug:
260
+ print("input--->",tokenizer.decode(input_ids[0]))
261
+ p1 = MinNewTokensLengthLogitsProcessor(
262
+ prompt_length_to_skip=input_ids.shape[-1],
263
+ min_new_tokens=5,
264
+ eos_token_id=bos_token_id,
265
+ )
266
+ with torch.inference_mode() and torch.cuda.amp.autocast(dtype=torch.float16):
267
+ outputs = model.generate(
268
+ batch_images,
269
+ input_ids,
270
+ attention_mask=attention_mask,
271
+ max_new_tokens=20,
272
+ # min_new_tokens=8,
273
+ num_beams=1,
274
+ # length_penalty=0,
275
+ image_start_index_list=image_start_index_list,
276
+ image_nums=image_nums,
277
+ added_bbox_list=added_bbox_list if len(added_bbox_list) != 0 else None,
278
+ logits_processor_list=[p1, visual_logits_processor],
279
+ )
280
+ if debug:
281
+ print("outputs--->",tokenizer.decode(outputs[0]))
282
+ if outputs[0, -2] in [previsual_token_id, visual_token_id] and outputs[0, -1] == bos_token_id:
283
+ prompt = tokenizer.decode(outputs.clone()[0])
284
+ is_visual = (outputs[0, -2] == visual_token_id)
285
+ batch_text = tokenizer.batch_decode(outputs[:, :-1])
286
+ encodings = tokenizer(
287
+ batch_text,
288
+ padding="longest",
289
+ truncation=True,
290
+ return_tensors="pt",
291
+ max_length=2000,
292
+ )
293
+ input_ids = encodings["input_ids"].cuda()
294
+ attention_mask = encodings["attention_mask"].cuda()
295
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
296
+ image_start_index_list = [[x] for x in image_start_index_list]
297
+ image_nums = [1] * len(input_ids)
298
+ if debug:
299
+ print("get the visual bbox--->",tokenizer.decode(input_ids[0]))
300
+ with torch.cuda.amp.autocast(dtype=torch.float16) and torch.no_grad():
301
+ outputs = model(
302
+ vision_x=batch_images,
303
+ lang_x=input_ids,
304
+ attention_mask=attention_mask,
305
+ image_nums=image_nums,
306
+ image_start_index_list=image_start_index_list,
307
+ added_bbox_list=added_bbox_list if len(added_bbox_list) != 0 else None,
308
+ add_box=added_bbox_list is not None and len(added_bbox_list) != 0,
309
+ )
310
+ boxes = outputs["boxes"]
311
+ scores = outputs["scores"]
312
+ # if not model.valid:
313
+ # import pdb; pdb.set_trace()
314
+ if boxes is not None:
315
+ if is_visual:
316
+ if have_prebox:
317
+ added_bbox_list.pop()
318
+ prompt = prompt.replace("<|#previsual#|><|#prebox#|><|#object#|>", "")
319
+ have_prebox = False
320
+ if debug:
321
+ print("find previsual and remove it--->", prompt)
322
+ first_box = boxes[scores.argmax()]
323
+ added_bbox_list += [torch.tensor(first_box).unsqueeze(0).cuda() / 224]
324
+ prompt = prompt[:-len(tokenizer.eos_token)]
325
+ prompt += box_token + endofobject_token
326
+ if debug:
327
+ print("after inserting visual---->", prompt)
328
+ else:
329
+ import numpy as np
330
+ import cv2
331
+ open_cv_image = np.array(batch[0]["image"])
332
+ open_cv_image = open_cv_image[:, :, ::-1].copy()
333
+ for i, pre_box in enumerate(boxes):
334
+ open_cv_image = cv2.rectangle(open_cv_image, pre_box[:2].astype(int), pre_box[2:].astype(int), (0, 255, 0), i+1)
335
+ cv2.imwrite("Atest.png", open_cv_image)
336
+ exit()
337
+ pre_box = boxes[scores.argmax()]
338
+ added_bbox_list += [torch.tensor(pre_box).unsqueeze(0).cuda() / 224]
339
+ prompt = prompt[:-len(tokenizer.eos_token)]
340
+ prompt += prebox_token + object_token
341
+ have_prebox = True
342
+ if debug:
343
+ print("after inserting previsual---->", prompt)
344
+ else:
345
+ import pdb;pdb.set_trace()
346
+ prompt = tokenizer.decode(outputs[0, :-2].clone()[0])
347
+ else:
348
+ break
349
+ outputs = outputs[:, ori_prompt_length:]
350
+ new_predictions = [
351
+ postprocess_captioning_generation(out).replace('"', "")
352
+ for out in tokenizer.batch_decode(outputs, skip_special_tokens=True)
353
+ ]
354
+ # import pdb; pdb.set_trace()
355
+ if rank == 0:
356
+ tqdm.write(new_predictions[0])
357
+ for i, sample in enumerate(batch):
358
+ predictions[int(sample["image_id"])] = {
359
+ "caption": new_predictions[i],
360
+ }
361
+ print(new_predictions)
362
+ exit()
363
+ results_path = (
364
+ f"flickrresults_{lang_encoder_name}_{rank}_{id}.json"
365
+ if is_flickr
366
+ else f"cocoresults_{lang_encoder_name}_{rank}_{id}.json"
367
+ )
368
+ with open(results_path, "w") as f:
369
+ f.write(
370
+ json.dumps(
371
+ [
372
+ {"image_id": k, "caption": predictions[k]["caption"]}
373
+ for k in predictions
374
+ ],
375
+ indent=2,
376
+ )
377
+ )
378
+ print("save to", results_path)
379
+ del predictions
380
+ time.sleep(10)
381
+ if world_size > 1:
382
+ torch.distributed.barrier()
383
+ if rank == 0:
384
+ print(f"evaluate on rank {rank}. world size is {world_size}")
385
+ predictions = []
386
+ for rank_i in range(world_size):
387
+ part_results_path = (
388
+ f"flickrresults_{lang_encoder_name}_{rank_i}_{id}.json"
389
+ if is_flickr
390
+ else f"cocoresults_{lang_encoder_name}_{rank_i}_{id}.json"
391
+ )
392
+ print("load", part_results_path)
393
+ predictions.extend(json.load(open(part_results_path)))
394
+ os.remove(part_results_path)
395
+ print("num:", len(predictions))
396
+ results_path = (
397
+ f"flickrresults_{lang_encoder_name}.json"
398
+ if is_flickr
399
+ else f"cocoresults_{lang_encoder_name}.json"
400
+ )
401
+ json.dump(predictions, open(results_path, "w"), indent=2)
402
+
403
+ metrics = compute_cider(
404
+ result_path=results_path,
405
+ annotations_path="/gpfs/u/home/LMCG/LMCGljnn/scratch/.cache/lavis/coco_gt/coco_karpathy_test_gt.json",
406
+ )
407
+ metrics["CIDEr"] *= 100
408
+ os.makedirs("eval_results", exist_ok=True)
409
+ acc = metrics["CIDEr"]
410
+ with open(os.path.join("eval_results", f"cococap_{model.expr_name}_{model.step_num}_{int(time.time())}_{acc}"), "w") as f:
411
+ f.write(json.dumps(predictions, indent=2))
412
+
413
+ # delete the temporary file
414
+ os.remove(results_path)
415
+ else:
416
+ metrics = {}
417
+ metrics["CIDEr"] = 0.0
418
+
419
+ return metrics["CIDEr"]
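A note on the decoding control used above: VisualLogitsProcessor steers greedy decoding by overwriting the logit of a chosen special token with a large constant (1000) whenever a trigger condition holds, so that token wins the argmax. A self-contained toy of that pattern, with hypothetical token ids and no model involved:

import torch
from transformers import LogitsProcessor

class ForceAfterTrigger(LogitsProcessor):
    """Toy processor: once TRIGGER was just generated, force FORCED on the next step."""
    TRIGGER = 7   # hypothetical id standing in for e.g. <|#object#|>
    FORCED = 11   # hypothetical id standing in for e.g. <|#visual#|>

    def __call__(self, input_ids, scores):
        if input_ids[0, -1] == self.TRIGGER:
            scores[0, self.FORCED] = 1000.0  # same trick as above: a huge logit wins greedy argmax
        return scores

processor = ForceAfterTrigger()
fake_ids = torch.tensor([[3, 5, 7]])   # last generated token is the trigger
fake_scores = torch.zeros(1, 32)       # toy vocabulary of 32 tokens
assert processor(fake_ids, fake_scores).argmax(dim=-1).item() == 11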
multimodal/build/lib/open_flamingo/eval/task/caption_chat.py ADDED
@@ -0,0 +1,417 @@
1
+ from lavis.datasets.builders import load_dataset
+ from coco_metric import compute_cider, postprocess_captioning_generation
2
+ import torch
3
+ import more_itertools
4
+ from tqdm import tqdm
5
+ import json
6
+ import time
7
+ import os
8
+ from transformers import LogitsProcessor, MinNewTokensLengthLogitsProcessor, ForcedEOSTokenLogitsProcessor
9
+ from PIL import Image
10
+
11
+ class VisualLogitsProcessor(LogitsProcessor):
12
+ def __init__(self, tokenizer):
13
+ super().__init__()
14
+ self.tokenizer = tokenizer
15
+ self.object_token_id = self.tokenizer("<|#object#|>", add_special_tokens=False)["input_ids"][-1]
16
+ self.prebox_token_id = self.tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
17
+ self.box_token_id = self.tokenizer("<|#box#|>", add_special_tokens=False)["input_ids"][-1]
18
+ self.previsual_token_id = self.tokenizer("<|#previsual#|>", add_special_tokens=False)["input_ids"][-1]
19
+ self.visual_token_id = self.tokenizer("<|#visual#|>", add_special_tokens=False)["input_ids"][-1]
20
+ self.eos_token_id = self.tokenizer.encode(self.tokenizer.eos_token)[-1]
21
+ self.endofobject_token_id = self.tokenizer("<|#endofobject#|>", add_special_tokens=False)["input_ids"][-1]
22
+ self.topk = 2
23
+
24
+ def __call__(self, input_ids, scores):
25
+ # print("decoding===>", self.tokenizer.decode(scores.sort(descending=True).indices.tolist()[0][:self.topk]))
26
+ # import pdb; pdb.set_trace()
27
+ if self.object_token_id in scores.sort(descending=True).indices.tolist()[0][1:self.topk] and self.eos_token_id not in scores.sort(descending=True).indices.tolist()[0][:self.topk] and (input_ids == self.object_token_id).sum() * 2 == (input_ids == self.endofobject_token_id).sum():
28
+ scores[0, self.object_token_id] = 1000
29
+ if input_ids[0, -1] == self.object_token_id and input_ids[0, -2] != self.prebox_token_id:
30
+ if (input_ids[0, :-1] == self.object_token_id).sum() != 0:
31
+ # print("generate a previsual token next")
32
+ scores[0, self.previsual_token_id] = 1000
33
+ elif input_ids[0, -1] == self.previsual_token_id or input_ids[0, -1] == self.visual_token_id:
34
+ # print("stop to run bbox generation for " + "previsual" if input_ids[0, -1] == self.previsual_token_id else "visual")
35
+ scores[0, self.eos_token_id] = 1000
36
+ elif input_ids[0, -1] == self.endofobject_token_id and input_ids[0, -2] != self.box_token_id:
37
+ # print("generate a visual token next")
38
+ scores[0, self.visual_token_id] = 1000
39
+ return scores
40
+
41
+
42
+ def prepare_batch_images(batch, image_processor):
43
+ batch_images = None
44
+ for b in batch:
45
+ b_image = image_processor(b["image"]).unsqueeze(0).unsqueeze(1).unsqueeze(0)
46
+ if batch_images is None:
47
+ batch_images = b_image
48
+ else:
49
+ batch_images = torch.cat([batch_images, b_image], dim=0)
50
+ return batch_images
51
+
52
+
53
+ def captioner(
54
+ model,tokenizer,image_ori,batch_images,input_ids,attention_mask,image_start_index_list,image_nums,added_bbox_list,debug=False):
55
+ """Generate a grounded caption for a single pre-processed image.
56
+ Returns:
57
+ tuple: (caption string, PIL image with predicted boxes drawn, or None)
58
+
59
+ """
60
+ visual_logits_processor = VisualLogitsProcessor(tokenizer)
61
+ model.eval()
62
+ # model.eval().cuda()
63
+ lang_encoder_name = model.lang_encoder.__class__.__name__.lower()
64
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
65
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
66
+ pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
67
+ bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
68
+ previsual_token_id = tokenizer("<|#previsual#|>", add_special_tokens=False)["input_ids"][-1]
69
+ visual_token_id = tokenizer("<|#visual#|>", add_special_tokens=False)["input_ids"][-1]
70
+ box_token = "<|#box#|>"
71
+ prebox_token = "<|#prebox#|>"
72
+ endofobject_token = "<|#endofobject#|>"
73
+ object_token = "<|#object#|>"
74
+ ori_prompt_length = len(input_ids[0])
75
+ have_prebox = False
+ out_image = None
76
+ while True:
77
+ batch_images = batch_images
78
+ input_ids = input_ids
79
+ attention_mask = attention_mask
80
+ image_start_index_list = image_start_index_list
81
+ image_nums = image_nums
82
+ if debug:
83
+ print("input--->",tokenizer.decode(input_ids[0]))
84
+ p1 = MinNewTokensLengthLogitsProcessor(
85
+ prompt_length_to_skip=input_ids.shape[-1],
86
+ min_new_tokens=5,
87
+ eos_token_id=bos_token_id,
88
+ )
89
+ with torch.inference_mode():
90
+ outputs = model.generate(
91
+ batch_images,
92
+ input_ids,
93
+ attention_mask=attention_mask,
94
+ max_new_tokens=20,
95
+ # min_new_tokens=8,
96
+ num_beams=1,
97
+ # length_penalty=0,
98
+ image_start_index_list=image_start_index_list,
99
+ image_nums=image_nums,
100
+ added_bbox_list=added_bbox_list if len(added_bbox_list) != 0 else None,
101
+ logits_processor_list=[p1, visual_logits_processor],
102
+ )
103
+ if debug:
104
+ print("outputs--->",tokenizer.decode(outputs[0]))
105
+ if outputs[0, -2] in [previsual_token_id, visual_token_id] and outputs[0, -1] == bos_token_id:
106
+ prompt = tokenizer.decode(outputs.clone()[0])
107
+ is_visual = (outputs[0, -2] == visual_token_id)
108
+ batch_text = tokenizer.batch_decode(outputs[:, :-1])
109
+ encodings = tokenizer(
110
+ batch_text,
111
+ padding="longest",
112
+ truncation=True,
113
+ return_tensors="pt",
114
+ max_length=2000,
115
+ )
116
+ input_ids = encodings["input_ids"]
117
+ attention_mask = encodings["attention_mask"]
118
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
119
+ image_start_index_list = [[x] for x in image_start_index_list]
120
+ image_nums = [1] * len(input_ids)
121
+ if debug:
122
+ print("get the visual bbox--->",tokenizer.decode(input_ids[0]))
123
+ with torch.no_grad():
124
+ outputs = model(
125
+ vision_x=batch_images,
126
+ lang_x=input_ids,
127
+ attention_mask=attention_mask,
128
+ image_nums=image_nums,
129
+ image_start_index_list=image_start_index_list,
130
+ added_bbox_list=added_bbox_list if len(added_bbox_list) != 0 else None,
131
+ add_box=added_bbox_list is not None and len(added_bbox_list) != 0,
132
+ )
133
+ boxes = outputs["boxes"]
134
+ scores = outputs["scores"]
135
+ # if not model.valid:
136
+ # import pdb; pdb.set_trace()
137
+ if boxes is not None:
138
+ if is_visual:
139
+ if have_prebox:
140
+ added_bbox_list.pop()
141
+ prompt = prompt.replace("<|#previsual#|><|#prebox#|><|#object#|>", "")
142
+ have_prebox = False
143
+ if debug:
144
+ print("find previsual and remove it--->", prompt)
145
+ first_box = boxes[scores.argmax()]
146
+ added_bbox_list += [torch.tensor(first_box).unsqueeze(0) / 224]
147
+ prompt = prompt[:-len(tokenizer.eos_token)]
148
+ prompt += box_token + endofobject_token
149
+ if debug:
150
+ print("after inserting visual---->", prompt)
151
+ else:
152
+ import numpy as np
153
+ import cv2
154
+ open_cv_image = np.array(image_ori)
155
+ open_cv_image = open_cv_image[:, :, ::-1].copy()
156
+ for i, pre_box in enumerate(boxes):
157
+ open_cv_image = cv2.rectangle(open_cv_image, pre_box[:2].astype(int), pre_box[2:].astype(int), (0, 255, 0), i+1)
158
+ out_image = Image.fromarray(cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2RGB))
159
+ # exit()
160
+ pre_box = boxes[scores.argmax()]
161
+ added_bbox_list += [torch.tensor(pre_box).unsqueeze(0).cuda() / 224]
162
+ prompt = prompt[:-len(tokenizer.eos_token)]
163
+ prompt += prebox_token + object_token
164
+ have_prebox = True
165
+ if debug:
166
+ print("after inserting previsual---->", prompt)
167
+ else:
168
+ if debug:
169
+ import pdb;pdb.set_trace()
170
+ prompt = tokenizer.decode(outputs[0, :-2].clone()[0])
171
+ else:
172
+ break
173
+ outputs = outputs[:, ori_prompt_length:]
174
+ outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0].replace('"', "")
175
+ # new_predictions = [
176
+ # postprocess_captioning_generation(out).replace('"', "")
177
+ # for out in tokenizer.batch_decode(outputs, skip_special_tokens=True)
178
+ # ]
179
+ # import pdb; pdb.set_trace()
180
+ return outputs, out_image
181
+
182
+
183
+ def evaluate_coco_flickr(
184
+ model,
185
+ tokenizer,
186
+ image_processor,
187
+ batch_size,
188
+ is_flickr=False,
189
+ vis_embed_size=None,
190
+ rank=0,
191
+ world_size=1,
192
+ id=0,
193
+ debug=False,
194
+ ):
195
+ """Evaluate a model on COCO dataset.
196
+ Returns:
197
+ float: CIDEr score
198
+
199
+ """
200
+ visual_logits_processor = VisualLogitsProcessor(tokenizer)
201
+ coco_dataset = load_dataset("coco_caption")
202
+ eval_dataset = coco_dataset["test"]
203
+ model.eval().cuda()
204
+ predictions = dict()
205
+ lang_encoder_name = model.lang_encoder.__class__.__name__.lower()
206
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
207
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
208
+ pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
209
+ bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
210
+ previsual_token_id = tokenizer("<|#previsual#|>", add_special_tokens=False)["input_ids"][-1]
211
+ visual_token_id = tokenizer("<|#visual#|>", add_special_tokens=False)["input_ids"][-1]
212
+ box_token = "<|#box#|>"
213
+ prebox_token = "<|#prebox#|>"
214
+ endofobject_token = "<|#endofobject#|>"
215
+ object_token = "<|#object#|>"
216
+ cnt = 0
217
+ if world_size > 1:
218
+ torch.distributed.barrier()
219
+ desc = "Running inference Flickr30k" if is_flickr else "Running inference COCO"
220
+ for ii, batch in enumerate(more_itertools.chunked(
221
+ tqdm(eval_dataset, desc=desc, disable=(rank != 0)), batch_size
222
+ )):
223
+ if ii % world_size != rank:
224
+ continue
225
+ cnt += len(batch)
226
+ batch[0]["image"] = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/images/img3.jpg").resize((224, 224))
227
+ batch_images = prepare_batch_images(
228
+ batch=batch,
229
+ image_processor=image_processor,
230
+ ).cuda()
231
+ prompt = f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>"
232
+ added_bbox_list = []
233
+ batch_text = [prompt for _ in batch]
234
+ encodings = tokenizer(
235
+ batch_text,
236
+ padding="longest",
237
+ truncation=True,
238
+ return_tensors="pt",
239
+ max_length=2000,
240
+ )
241
+ ori_prompt_length = len(encodings["input_ids"][0])
242
+ have_prebox = False
243
+ while True:
244
+ batch_text = [prompt for _ in batch]
245
+ encodings = tokenizer(
246
+ batch_text,
247
+ padding="longest",
248
+ truncation=True,
249
+ return_tensors="pt",
250
+ max_length=2000,
251
+ )
252
+ input_ids = encodings["input_ids"].cuda()
253
+ attention_mask = encodings["attention_mask"].cuda()
254
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
255
+ image_start_index_list = [[x] for x in image_start_index_list]
256
+ image_nums = [1] * len(input_ids)
257
+ if debug:
258
+ print("input--->",tokenizer.decode(input_ids[0]))
259
+ p1 = MinNewTokensLengthLogitsProcessor(
260
+ prompt_length_to_skip=input_ids.shape[-1],
261
+ min_new_tokens=5,
262
+ eos_token_id=bos_token_id,
263
+ )
264
+ with torch.inference_mode() and torch.cuda.amp.autocast(dtype=torch.float16):
265
+ outputs = model.generate(
266
+ batch_images,
267
+ input_ids,
268
+ attention_mask=attention_mask,
269
+ max_new_tokens=20,
270
+ # min_new_tokens=8,
271
+ num_beams=1,
272
+ # length_penalty=0,
273
+ image_start_index_list=image_start_index_list,
274
+ image_nums=image_nums,
275
+ added_bbox_list=added_bbox_list if len(added_bbox_list) != 0 else None,
276
+ logits_processor_list=[p1, visual_logits_processor],
277
+ )
278
+ if debug:
279
+ print("outputs--->",tokenizer.decode(outputs[0]))
280
+ if outputs[0, -2] in [previsual_token_id, visual_token_id] and outputs[0, -1] == bos_token_id:
281
+ prompt = tokenizer.decode(outputs.clone()[0])
282
+ is_visual = (outputs[0, -2] == visual_token_id)
283
+ batch_text = tokenizer.batch_decode(outputs[:, :-1])
284
+ encodings = tokenizer(
285
+ batch_text,
286
+ padding="longest",
287
+ truncation=True,
288
+ return_tensors="pt",
289
+ max_length=2000,
290
+ )
291
+ input_ids = encodings["input_ids"].cuda()
292
+ attention_mask = encodings["attention_mask"].cuda()
293
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
294
+ image_start_index_list = [[x] for x in image_start_index_list]
295
+ image_nums = [1] * len(input_ids)
296
+ if debug:
297
+ print("get the visual bbox--->",tokenizer.decode(input_ids[0]))
298
+ with torch.cuda.amp.autocast(dtype=torch.float16) and torch.no_grad():
299
+ outputs = model(
300
+ vision_x=batch_images,
301
+ lang_x=input_ids,
302
+ attention_mask=attention_mask,
303
+ image_nums=image_nums,
304
+ image_start_index_list=image_start_index_list,
305
+ added_bbox_list=added_bbox_list if len(added_bbox_list) != 0 else None,
306
+ add_box=added_bbox_list is not None and len(added_bbox_list) != 0,
307
+ )
308
+ boxes = outputs["boxes"]
309
+ scores = outputs["scores"]
310
+ # if not model.valid:
311
+ # import pdb; pdb.set_trace()
312
+ if boxes is not None:
313
+ if is_visual:
314
+ if have_prebox:
315
+ added_bbox_list.pop()
316
+ prompt = prompt.replace("<|#previsual#|><|#prebox#|><|#object#|>", "")
317
+ have_prebox = False
318
+ if debug:
319
+ print("find previsual and remove it--->", prompt)
320
+ first_box = boxes[scores.argmax()]
321
+ added_bbox_list += [torch.tensor(first_box).unsqueeze(0).cuda() / 224]
322
+ prompt = prompt[:-len(tokenizer.eos_token)]
323
+ prompt += box_token + endofobject_token
324
+ if debug:
325
+ print("after inserting visual---->", prompt)
326
+ else:
327
+ import numpy as np
328
+ import cv2
329
+ open_cv_image = np.array(batch[0]["image"])
330
+ open_cv_image = open_cv_image[:, :, ::-1].copy()
331
+ for i, pre_box in enumerate(boxes):
332
+ open_cv_image = cv2.rectangle(open_cv_image, pre_box[:2].astype(int), pre_box[2:].astype(int), (0, 255, 0), i+1)
333
+ cv2.imwrite("Atest.png", open_cv_image)
334
+ exit()
335
+ pre_box = boxes[scores.argmax()]
336
+ added_bbox_list += [torch.tensor(pre_box).unsqueeze(0).cuda() / 224]
337
+ prompt = prompt[:-len(tokenizer.eos_token)]
338
+ prompt += prebox_token + object_token
339
+ have_prebox = True
340
+ if debug:
341
+ print("after inserting previsual---->", prompt)
342
+ else:
343
+ import pdb;pdb.set_trace()
344
+ prompt = tokenizer.decode(outputs[0, :-2].clone()[0])
345
+ else:
346
+ break
347
+ outputs = outputs[:, ori_prompt_length:]
348
+ new_predictions = [
349
+ postprocess_captioning_generation(out).replace('"', "")
350
+ for out in tokenizer.batch_decode(outputs, skip_special_tokens=True)
351
+ ]
352
+ # import pdb; pdb.set_trace()
353
+ if rank == 0:
354
+ tqdm.write(new_predictions[0])
355
+ for i, sample in enumerate(batch):
356
+ predictions[int(sample["image_id"])] = {
357
+ "caption": new_predictions[i],
358
+ }
359
+ print(new_predictions)
360
+ exit()
361
+ results_path = (
362
+ f"flickrresults_{lang_encoder_name}_{rank}_{id}.json"
363
+ if is_flickr
364
+ else f"cocoresults_{lang_encoder_name}_{rank}_{id}.json"
365
+ )
366
+ with open(results_path, "w") as f:
367
+ f.write(
368
+ json.dumps(
369
+ [
370
+ {"image_id": k, "caption": predictions[k]["caption"]}
371
+ for k in predictions
372
+ ],
373
+ indent=2,
374
+ )
375
+ )
376
+ print("save to", results_path)
377
+ del predictions
378
+ time.sleep(10)
379
+ if world_size > 1:
380
+ torch.distributed.barrier()
381
+ if rank == 0:
382
+ print(f"evaluate on rank {rank}. world size is {world_size}")
383
+ predictions = []
384
+ for rank_i in range(world_size):
385
+ part_results_path = (
386
+ f"flickrresults_{lang_encoder_name}_{rank_i}_{id}.json"
387
+ if is_flickr
388
+ else f"cocoresults_{lang_encoder_name}_{rank_i}_{id}.json"
389
+ )
390
+ print("load", part_results_path)
391
+ predictions.extend(json.load(open(part_results_path)))
392
+ os.remove(part_results_path)
393
+ print("num:", len(predictions))
394
+ results_path = (
395
+ f"flickrresults_{lang_encoder_name}.json"
396
+ if is_flickr
397
+ else f"cocoresults_{lang_encoder_name}.json"
398
+ )
399
+ json.dump(predictions, open(results_path, "w"), indent=2)
400
+
401
+ metrics = compute_cider(
402
+ result_path=results_path,
403
+ annotations_path="/gpfs/u/home/LMCG/LMCGljnn/scratch/.cache/lavis/coco_gt/coco_karpathy_test_gt.json",
404
+ )
405
+ metrics["CIDEr"] *= 100
406
+ os.makedirs("eval_results", exist_ok=True)
407
+ acc = metrics["CIDEr"]
408
+ with open(os.path.join("eval_results", f"cococap_{model.expr_name}_{model.step_num}_{int(time.time())}_{acc}"), "w") as f:
409
+ f.write(json.dumps(predictions, indent=2))
410
+
411
+ # delete the temporary file
412
+ os.remove(results_path)
413
+ else:
414
+ metrics = {}
415
+ metrics["CIDEr"] = 0.0
416
+
417
+ return metrics["CIDEr"]
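Both copies of prepare_batch_images stack each processed image into a 6-D tensor before handing it to the model. A quick shape check with a stand-in processor (the fake below is only a placeholder for the real preprocessing transform; reading the axes as (batch, T_img, frames, C, H, W) is the usual Flamingo-style layout rather than something stated in this file):

import torch

def fake_image_processor(_img):
    # placeholder for the real preprocessing transform; returns a (C, H, W) tensor
    return torch.zeros(3, 224, 224)

batch = [{"image": None}, {"image": None}]
batch_images = None
for b in batch:
    b_image = fake_image_processor(b["image"]).unsqueeze(0).unsqueeze(1).unsqueeze(0)
    batch_images = b_image if batch_images is None else torch.cat([batch_images, b_image], dim=0)

print(batch_images.shape)  # torch.Size([2, 1, 1, 3, 224, 224])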
multimodal/build/lib/open_flamingo/eval/task/cola.py ADDED
@@ -0,0 +1,220 @@
1
+ import json
2
+ import webdataset as wds
3
+ from tqdm import tqdm
4
+ from PIL import Image
5
+ import torch
6
+ import numpy as np
7
+ import os
8
+ import time
9
+ import cv2
10
+ import random
11
+ import math
12
+ from open_flamingo.eval.task.utils import (
13
+ get_object_from_text,
14
+ is_correct,
15
+ _eval_text_image,
16
+ get_bbox,
17
+ get_iou,
18
+ )
19
+ DATASET = "/gpfs/u/home/LMCG/LMCGljnn/scratch/code/COLA/data/COLA_multiobjects_matching_benchmark.json"
20
+ VG_ROOT = "/gpfs/u/home/LMCG/LMCGljnn/scratch/datasets/raw/vg/VG_100K"
21
+
22
+ def get_score(image, text, model, tokenizer, image_processor, vis_embed_size):
23
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
24
+ prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
25
+ object_token_id = tokenizer("<|#object#|>", add_special_tokens=False)["input_ids"][-1]
26
+ text = text.split("#")
27
+ obj_A = text[0].strip().split(" ")
28
+ relation = text[1].strip()
29
+ obj_B = text[2].strip().split(" ")
30
+ if "computer mouse" not in text[0].strip():
31
+ attrAs = obj_A[:-1]
32
+ nounA = obj_A[-1]
33
+ else:
34
+ attrAs = obj_A[:-2]
35
+ nounA = " ".join(obj_A[-2:])
36
+ if "computer mouse" not in text[2].strip():
37
+ attrBs = obj_B[:-1]
38
+ nounB = obj_B[-1]
39
+ else:
40
+ attrBs = obj_B[:-2]
41
+ nounB = " ".join(obj_B[-2:])
42
+ # print("="*80)
43
+ # print(attrAs, nounA)
44
+ # print(attrBs, nounB)
45
+ # print(relation)
46
+ # print("="*80)
47
+ batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
48
+
49
+
50
+ prompt1 = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|><|#object#|>the {nounA}<|#endofobject#|><|#visual#|>"]
51
+ boxes, scores = get_bbox(None, batch_images, prompt1, model, tokenizer, media_token_id, prebox_token_id, return_all=True)
52
+
53
+
54
+ # open_cv_image = np.array(image)
55
+ # open_cv_image = open_cv_image[:, :, ::-1].copy()
56
+ # for pre_box in boxes:
57
+ # open_cv_image = cv2.rectangle(open_cv_image, pre_box[:2].astype(int), pre_box[2:].astype(int), (0, 255, 0), 2)
58
+
59
+ box_ppl = []
60
+ box_attr_losses = []
61
+ for box in boxes:
62
+ losses = []
63
+ for attrA in attrAs:
64
+ prompt2 = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|><|#object#|><|#previsual#|><|#prebox#|><|#object#|> the {attrA} {nounA}"]
65
+ encodings = tokenizer(
66
+ prompt2,
67
+ padding="longest",
68
+ truncation=True,
69
+ return_tensors="pt",
70
+ max_length=512,
71
+ )
72
+ input_ids = encodings["input_ids"]
73
+ attention_mask = encodings["attention_mask"]
74
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
75
+ image_start_index_list = [[x] for x in image_start_index_list]
76
+ image_nums = [1] * len(input_ids)
77
+ vision_x = batch_images.cuda()
78
+ lang_x = input_ids.cuda()
79
+ attention_mask = attention_mask.cuda()
80
+ labels = lang_x.clone()
81
+ start_idx = (labels == object_token_id).nonzero()[-1, -1]
82
+ labels[0, :start_idx+1] = -100
83
+ added_bbox_list = [torch.tensor(box / 224.0).cuda().unsqueeze(0)]
84
+ with torch.cuda.amp.autocast(dtype=torch.float16) and torch.no_grad():
85
+ outputs = model(
86
+ vision_x=vision_x,
87
+ lang_x=lang_x,
88
+ attention_mask=attention_mask,
89
+ labels=labels,
90
+ image_nums=image_nums,
91
+ image_start_index_list=image_start_index_list,
92
+ added_bbox_list=added_bbox_list,
93
+ add_box=added_bbox_list is not None,
94
+ relations=None,
95
+ )
96
+ loss = outputs.loss
97
+ loss = (loss.sum() / (loss != 0).sum()).item()
98
+ losses.append(loss)
99
+ avg_ppl = np.array(losses).mean()
100
+ box_ppl.append(avg_ppl)
101
+ box_attr_losses.append(losses)
102
+ fit_idx = np.array(box_ppl).argmin()
103
+ fit_box = boxes[fit_idx]
104
+ fit_attr = attrAs[np.array(box_attr_losses[fit_idx]).argmin()]
105
+ first_ppl = min(box_ppl)
106
+
107
+ # open_cv_image = cv2.rectangle(open_cv_image, fit_box[:2].astype(int), fit_box[2:].astype(int), (255, 0, 0), 2)
108
+
109
+
110
+ prompt3 = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|><|#object#|>the {fit_attr} {nounA}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|> is {relation}<|#object#|><|#previsual#|>"]
111
+ boxes, scores = get_bbox([torch.tensor(fit_box / 224).cuda().unsqueeze(0)], batch_images, prompt3, model, tokenizer, media_token_id, prebox_token_id, return_all=True)
112
+ # for i, pre_box in enumerate(boxes):
113
+ # open_cv_image = cv2.rectangle(open_cv_image, pre_box[:2].astype(int), pre_box[2:].astype(int), (0, 0, 255), i+1)
114
+ # cv2.imwrite(f"Atest.png", open_cv_image)
115
+
116
+ box_ppl = []
117
+ for box in boxes:
118
+ losses = []
119
+ for attrB in attrBs:
120
+ prompt4 = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|><|#object#|>the {fit_attr} {nounA}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|> is {relation}<|#object#|><|#previsual#|><|#prebox#|><|#object#|> the {attrB} {nounB}"]
121
+ encodings = tokenizer(
122
+ prompt4,
123
+ padding="longest",
124
+ truncation=True,
125
+ return_tensors="pt",
126
+ max_length=512,
127
+ )
128
+ input_ids = encodings["input_ids"]
129
+ attention_mask = encodings["attention_mask"]
130
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
131
+ image_start_index_list = [[x] for x in image_start_index_list]
132
+ image_nums = [1] * len(input_ids)
133
+ vision_x = batch_images.cuda()
134
+ lang_x = input_ids.cuda()
135
+ attention_mask = attention_mask.cuda()
136
+ labels = lang_x.clone()
137
+ start_idx = (labels == object_token_id).nonzero()[-1, -1]
138
+ labels[0, :start_idx+1] = -100
139
+ added_bbox_list = [torch.tensor(fit_box / 224.0).cuda().unsqueeze(0), torch.tensor(box / 224.0).cuda().unsqueeze(0)]
140
+ with torch.cuda.amp.autocast(dtype=torch.float16) and torch.no_grad():
141
+ outputs = model(
142
+ vision_x=vision_x,
143
+ lang_x=lang_x,
144
+ attention_mask=attention_mask,
145
+ labels=labels,
146
+ image_nums=image_nums,
147
+ image_start_index_list=image_start_index_list,
148
+ added_bbox_list=added_bbox_list,
149
+ add_box=added_bbox_list is not None,
150
+ relations=None,
151
+ )
152
+ loss = outputs.loss
153
+ loss = (loss.sum() / (loss != 0).sum()).item()
154
+ losses.append(loss)
155
+ avg_ppl = np.array(losses).mean()
156
+ box_ppl.append(avg_ppl)
157
+ second_ppl = (np.array(box_ppl) * np.array(scores)).sum() / sum(scores)
158
+ return (first_ppl + second_ppl) / 2
159
+
160
+
161
+ def evaluate_cola(
162
+ model,
163
+ tokenizer,
164
+ image_processor,
165
+ vis_embed_size=None,
166
+ rank=0,
167
+ world_size=1,
168
+ id=0,
169
+ debug=False,
170
+ ):
171
+ dataset_name = "cola"
172
+ dataset = json.load(open(DATASET))
173
+ model = model.cuda().eval()
174
+ correct = 0
175
+ total = 0
176
+ pbar = tqdm(dataset, disable=(rank != 0))
177
+ for ii, sample in enumerate(pbar):
178
+ if ii % world_size != rank:
179
+ continue
180
+ image1 = Image.open(os.path.join(VG_ROOT, os.path.basename(sample[0]))).convert("RGB").resize((224, 224))
181
+ text1 = sample[1]
182
+ image2 = Image.open(os.path.join(VG_ROOT, os.path.basename(sample[2]))).convert("RGB").resize((224, 224))
183
+ text2 = sample[3]
184
+ score11 = -get_score(image1, text1, model, tokenizer, image_processor, vis_embed_size)
185
+ score12 = -get_score(image1, text2, model, tokenizer, image_processor, vis_embed_size)
186
+ score21 = -get_score(image2, text1, model, tokenizer, image_processor, vis_embed_size)
187
+ score22 = -get_score(image2, text2, model, tokenizer, image_processor, vis_embed_size)
188
+ if rank == 0:
189
+ tqdm.write(f"{score11:.2f} {score12:.2f} {score21:.2f} {score22:.2f}")
190
+ if score11 > score21 and score22 > score12:
191
+ correct += 1
192
+ total += 1
193
+ pbar.set_description(f"{correct / total:.2f}")
194
+ print(rank, correct / total)
195
+
196
+ with open(f"{dataset_name}_results_part{rank}_{id}.json", "w") as f:
197
+ f.write(json.dumps([total, correct]))
198
+ if world_size > 1:
199
+ torch.distributed.barrier()
200
+ if rank == 0:
201
+ total = 0
202
+ correct = 0
203
+ print(f"evaluate on rank {rank}. world size is {world_size}")
204
+ for rank_i in range(world_size):
205
+ [total_part, correct_part] = json.load(open(f"{dataset_name}_results_part{rank_i}_{id}.json"))
206
+ os.remove(f"{dataset_name}_results_part{rank_i}_{id}.json")
207
+ total += total_part
208
+ correct += correct_part
209
+ score = correct / total
210
+ print("score:", score)
211
+ with open(os.path.join("eval_results", f"{dataset_name}_{model.expr_name}_{model.step_num}_{int(time.time())}_{score}_{total}"), "w") as f:
212
+ pass
213
+ else:
214
+ score = 0.0
215
+ if world_size > 1:
216
+ torch.distributed.barrier()
217
+ return score
218
+
219
+ if __name__ == "__main__":
220
+ evaluate_cola(None, None, None)
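Two pieces of logic above are easy to miss: the per-caption score is a masked mean of token losses (positions labelled -100 contribute zero loss and are excluded from the denominator), and a COLA sample only counts as correct when each caption scores higher with its own image than with the other image. A model-free sketch with toy numbers:

import torch

# masked-mean trick from get_score(): zero entries are the ignored (-100) label positions
token_losses = torch.tensor([0.0, 0.0, 2.0, 1.0, 3.0])
per_caption_loss = (token_losses.sum() / (token_losses != 0).sum()).item()
print(per_caption_loss)  # 2.0, averaged over the three scored tokens only

def group_correct(score11, score12, score21, score22):
    # text1 must score higher with image1 than with image2, and text2 higher with image2
    return score11 > score21 and score22 > score12

print(group_correct(-1.0, -2.5, -1.8, -2.0))  # True for these toy (negated-loss) scores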
multimodal/build/lib/open_flamingo/eval/task/crepe.py ADDED
@@ -0,0 +1,93 @@
1
+ import json
2
+ import webdataset as wds
3
+ from tqdm import tqdm
4
+ from PIL import Image
5
+ import torch
6
+ import numpy as np
7
+ import os
8
+ import time
9
+ import cv2
10
+ import random
11
+ import pandas as pd
12
+ from .vl_checklist import _eval_text_image
13
+ DATASET_ROOT = "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/cdl/instruct_data/crepe/prod_hard_negatives"
14
+
15
+
16
+ def evaluate_crepe(
17
+ model,
18
+ tokenizer,
19
+ image_processor,
20
+ vis_embed_size=None,
21
+ rank=0,
22
+ world_size=1,
23
+ id=0,
24
+ subset=True,
25
+ debug=False,
26
+ level=4,
27
+ type="swap",
28
+ ):
29
+ if rank == 0:
30
+ tqdm.write(f"level: {level}")
31
+ tqdm.write(f"type: {type}")
32
+ dataset_name = "crepe"
33
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
34
+ box_token_id = tokenizer("<|#box#|>", add_special_tokens=False)["input_ids"][-1]
35
+ endofobject_token_id = tokenizer("<|#endofobject#|>", add_special_tokens=False)["input_ids"][-1]
36
+ endofattr_token_id = tokenizer("<|#endofattr#|>", add_special_tokens=False)["input_ids"][-1]
37
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
38
+ visual_token_id = tokenizer("<|#visual#|>", add_special_tokens=False)["input_ids"][-1]
39
+ previsual_token_id = tokenizer("<|#previsual#|>", add_special_tokens=False)["input_ids"][-1]
40
+ prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
41
+ model.eval().cuda()
42
+ total = 0
43
+ correct = 0
44
+ assert type in ["swap"]
45
+ assert 4 <= level <= 12
46
+ filename = os.path.join(DATASET_ROOT, type, f"prod_vg_hard_negs_{type}_complexity_{level}.csv")
47
+ df = pd.read_csv(filename)
48
+ pbar = tqdm(df.iterrows(), disable=(rank != 0))
49
+ for ii, sample in pbar:
50
+ if ii % world_size != rank:
51
+ continue
52
+ text = sample.caption
53
+ image_path = "/gpfs/u/home/LMCG/LMCGljnn/scratch/datasets/raw/vg/VG_100K/{}.jpg".format(sample.image_id)
54
+ x = sample.x
55
+ y = sample.y
56
+ width = sample.width
57
+ height = sample.height
58
+ image = Image.open(image_path).convert("RGB")
59
+ image = image.crop((x, y, x+width, y+height))
60
+ image = image.resize((224, 224))
61
+ final_rank, final_ranks = _eval_text_image(text, image, model, tokenizer, image_processor, vis_embed_size, media_token_id, prebox_token_id, debug=debug)
62
+ if final_rank is None:
63
+ continue
64
+ correct += int((np.array(final_ranks) < 10).sum())
65
+ total += len(final_ranks)
66
+ if debug:
67
+ tqdm.write("="*80)
68
+ pbar.set_description(f"{text} | score: {correct / total:.4f} | {final_rank} | {final_ranks}")
69
+
70
+
71
+ with open(f"{dataset_name}_results_part{rank}_{id}.json", "w") as f:
72
+ f.write(json.dumps([total, correct]))
73
+ if world_size > 1:
74
+ torch.distributed.barrier()
75
+ if rank == 0:
76
+ total = 0
77
+ correct = 0
78
+ print(f"evaluate on rank {rank}. world size is {world_size}")
79
+ for rank_i in range(world_size):
80
+ [total_part, correct_part] = json.load(open(f"{dataset_name}_results_part{rank_i}_{id}.json"))
81
+ os.remove(f"{dataset_name}_results_part{rank_i}_{id}.json")
82
+ total += total_part
83
+ correct += correct_part
84
+ score = correct / total
85
+ print("score:", score, "total:", total)
86
+ with open(os.path.join("eval_results", f"{dataset_name}_{model.expr_name}_{model.step_num}_{int(time.time())}_{score}"), "w") as f:
87
+ pass
88
+ else:
89
+ score = 0.0
90
+ if world_size > 1:
91
+ torch.distributed.barrier()
92
+ return score
93
+
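The CREPE accuracy above is a rank-based count: for each probed position, _eval_text_image reports the rank of the ground-truth token, and a position counts as correct when that rank is below 10. A toy tally with made-up ranks:

import numpy as np

final_ranks = np.array([0, 3, 27, 9, 114])  # hypothetical ranks of the target tokens
correct = int((final_ranks < 10).sum())     # 3 of the 5 positions land in the top 10
total = len(final_ranks)
print(correct / total)                      # 0.6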
multimodal/build/lib/open_flamingo/eval/task/gqa.py ADDED
@@ -0,0 +1,248 @@
1
+ from torch.utils.data import Dataset
2
+ import json
3
+ from PIL import Image
4
+ import os
5
+ import torch
6
+ import more_itertools
7
+ from tqdm import tqdm
8
+ import time
9
+ from vqa_metric import compute_gqa_accuracy
10
+ import string
11
+ import uuid
12
+ import numpy as np
13
+ import cv2
14
+ from open_flamingo.eval.task.utils import get_bbox
15
+
16
+ class GQADataset(Dataset):
17
+ def __init__(
18
+ self,
19
+ image_dir_path="/gpfs/u/home/LMCG/LMCGljnn/scratch/datasets/raw/gqa/images",
20
+ annotations_path="/gpfs/u/home/LMCG/LMCGljnn/scratch/datasets/raw/gqa/testdev_balanced_questions.json",
21
+ ):
22
+ annotations = json.load(open(annotations_path))
23
+ self.questions = []
24
+ self.answers = []
25
+ self.image_paths = []
26
+ self.question_ids = []
27
+ for anno_id in annotations:
28
+ question = annotations[anno_id]["question"]
29
+ imageId = annotations[anno_id]["imageId"]
30
+ answer = annotations[anno_id]["answer"]
31
+ self.questions.append(question)
32
+ self.answers.append(answer)
33
+ self.image_paths.append(os.path.join(image_dir_path, "{}.jpg".format(imageId)))
34
+ self.question_ids.append(anno_id)
35
+ # print(annotations[anno_id]["types"])
36
+ self.vqa_dataset = "gqa"
37
+
38
+ def __len__(self):
39
+ return len(self.questions)
40
+
41
+ def __getitem__(self, idx):
42
+ question = self.questions[idx]
43
+ question_id = self.question_ids[idx]
44
+ answer = self.answers[idx]
45
+ img_path = self.image_paths[idx]
46
+ image = Image.open(img_path)
47
+ return {
48
+ "image": image,
49
+ "question": question,
50
+ "answers": answer,
51
+ "question_id": question_id,
52
+ }
53
+
54
+
55
+ def prepare_batch_images(batch, image_processor):
56
+ batch_images = None
57
+ for b in batch:
58
+ b_image = image_processor(b["image"]).unsqueeze(0).unsqueeze(1).unsqueeze(0)
59
+ if batch_images is None:
60
+ batch_images = b_image
61
+ else:
62
+ batch_images = torch.cat([batch_images, b_image], dim=0)
63
+ return batch_images
64
+
65
+
66
+
67
+ def evaluate_gqa(
68
+ model,
69
+ tokenizer,
70
+ image_processor,
71
+ batch_size=1,
72
+ vis_embed_size=None,
73
+ rank=0,
74
+ world_size=1,
75
+ id=0,
76
+ ):
77
+ """
78
+ Evaluate a model on the GQA testdev-balanced questions split.
79
+
80
+ Args:
81
+ model (nn.Module): model to evaluate
82
+ tokenizer (transformers.PreTrainedTokenizer): tokenizer for the model
83
+ image_processor : image processor for the model
84
+ batch_size (int): batch size
85
+ image_dir_path (str): path to image directory
86
+ questions_json_path (str): path to questions json file
87
+ annotations_json_path (str): path to annotations json file
88
+ seed (int, optional): random seed. Defaults to 42.
89
+ max_generation_length (int, optional): max generation length. Defaults to 5.
90
+ num_beams (int, optional): number of beams to use for beam search. Defaults to 3.
91
+ length_penalty (float, optional): length penalty for beam search. Defaults to -2.0.
92
+ num_samples (int, optional): number of samples to evaluate on. Defaults to 5000 samples.
93
+ query_set_size (int, optional): size of the query set. Defaults to 2048.
94
+ num_shots (int, optional): number of shots to use. Defaults to 8.
95
+ device (int, optional): device to use. Defaults to -1 (cpu).
96
+ num_workers (int, optional): number of workers to use. Defaults to 4.
97
+ vqa_dataset (string): type of vqa dataset: currently supports vqa, ok_vqa. Defaults to vqa.
98
+ Returns:
99
+ float: accuracy score
100
+ """
101
+ assert batch_size == 1
102
+ vqa_dataset = "gqa"
103
+ eval_dataset = GQADataset()
104
+ object_token_id = tokenizer("<|#object#|>", add_special_tokens=False)["input_ids"][-1]
105
+ endofobject_token_id = tokenizer("<|#endofobject#|>", add_special_tokens=False)["input_ids"][-1]
106
+ prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
107
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
108
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
109
+ pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
110
+ bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
111
+ def get_prompt(sample):
112
+ return f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>Question: {sample['question'].strip()} Short answer:"
113
+ model.eval().cuda()
114
+ lang_encoder_name = model.lang_encoder.__class__.__name__.lower()
115
+ predictions = []
116
+ if batch_size != 1:
117
+ tokenizer.padding_side = "left"
118
+ if world_size > 1:
119
+ torch.distributed.barrier()
120
+ this_tot = 0
121
+ for ii, batch in enumerate(more_itertools.chunked(
122
+ tqdm(eval_dataset, desc="Running inference", disable=(rank != 0)), batch_size,
123
+ )):
124
+ if ii % world_size != rank:
125
+ continue
126
+ batch[0]["image"] = batch[0]["image"].resize((224, 224))
127
+ batch_images = prepare_batch_images(
128
+ batch=batch,
129
+ image_processor=image_processor,
130
+ ).cuda()
131
+ batch_text = [get_prompt(s) for s in batch]
132
+ encodings = tokenizer(
133
+ batch_text,
134
+ return_tensors="pt",
135
+ padding="longest",
136
+ truncation=True,
137
+ max_length=2000,
138
+ )
139
+ input_ids = encodings["input_ids"].cuda()
140
+ attention_mask = encodings["attention_mask"].cuda()
141
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
142
+ image_start_index_list = [[x] for x in image_start_index_list]
143
+ image_nums = [1] * len(input_ids)
144
+ with torch.inference_mode() and torch.cuda.amp.autocast(dtype=torch.float16):
145
+ outputs = model.generate(
146
+ batch_images,
147
+ input_ids,
148
+ attention_mask=attention_mask,
149
+ max_new_tokens=10,
150
+ min_length=1,
151
+ num_beams=1,
152
+ # length_penalty=0,
153
+ image_start_index_list=image_start_index_list,
154
+ image_nums=image_nums,
155
+ added_bbox_list=None,
156
+ return_dict_in_generate=True,
157
+ output_scores=True,
158
+ )
159
+ scores = outputs.scores
160
+ outputs = outputs.sequences[:, len(input_ids[0]) :]
161
+ if object_token_id in scores[0][0].sort(descending=True).indices[:5]:
162
+ sample = batch[0]
163
+ # print("="*80)
164
+ # print("sample:", batch, scores[0][0].sort(descending=True).indices[:10].tolist().index(object_token_id))
165
+ prompt1 = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>Question: {sample['question'].strip()} Short answer:<|#object#|><|#previsual#|>"]
166
+ boxes, scores = get_bbox(None, batch_images, prompt1, model, tokenizer, media_token_id, prebox_token_id, return_all=True)
167
+ # open_cv_image = np.array(sample["image"])
168
+ # open_cv_image = open_cv_image[:, :, ::-1].copy()
169
+ # cv2.imwrite(f"Atest_ori.png", open_cv_image)
170
+ # open_cv_image = cv2.rectangle(open_cv_image, boxes[0][:2].astype(int), boxes[0][2:].astype(int), (0, 255, 0), 2)
171
+ # print(scores)
172
+ # cv2.imwrite(f"Atest.png", open_cv_image)
173
+ if boxes is not None and len(boxes) > 0:
174
+ prompt2 = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>Question: {sample['question'].strip()} Short answer: it is<|#object#|><|#previsual#|><|#prebox#|><|#object#|> a"]
175
+ encodings = tokenizer(
176
+ prompt2,
177
+ return_tensors="pt",
178
+ padding="longest",
179
+ truncation=True,
180
+ max_length=2000,
181
+ )
182
+ input_ids = encodings["input_ids"].cuda()
183
+ attention_mask = encodings["attention_mask"].cuda()
184
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
185
+ image_start_index_list = [[x] for x in image_start_index_list]
186
+ image_nums = [1] * len(input_ids)
187
+ added_bbox_list = [torch.tensor(boxes[0]/224.0).cuda().unsqueeze(0).clamp(0, 0.99)]
188
+ with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):  # chaining with "and" would enter only one of the two context managers
189
+ outputs = model.generate(
190
+ batch_images,
191
+ input_ids,
192
+ attention_mask=attention_mask,
193
+ max_new_tokens=10,
194
+ min_length=1,
195
+ num_beams=1,
196
+ image_start_index_list=image_start_index_list,
197
+ image_nums=image_nums,
198
+ added_bbox_list=added_bbox_list,
199
+ eos_token_id=(endofobject_token_id),
200
+ )
201
+ outputs = outputs[:, len(input_ids[0]) :]
202
+ # print("previsual===>{}".format(tokenizer.decode(outputs[0], skip_special_tokens=True).strip().lower().strip(string.punctuation+" ")))
203
+
204
+ # postprocess begin
205
+ new_predictions = [
206
+ out.strip().lower().strip(string.punctuation+" ") for out in tokenizer.batch_decode(outputs, skip_special_tokens=True)
207
+ ]
208
+ this_tot += 1
209
+ predictions.extend(
210
+ [
211
+ {"answer": p, "question_id": sample["question_id"], "_question": sample["question"], "answers": sample["answers"]}
212
+ for p, sample in zip(new_predictions, batch)
213
+ ]
214
+ )
215
+ with open(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank}_{id}.json", "w") as f:
216
+ f.write(json.dumps(predictions))
217
+ print("save to", f"{vqa_dataset}_{lang_encoder_name}_results_part{rank}_{id}.json")
218
+
219
+ time.sleep(10)
220
+ if world_size > 1:
221
+ torch.distributed.barrier()
222
+ if rank == 0:
223
+ print(f"evaluate on rank {rank}. world size is {world_size}")
224
+ predictions = []
225
+ for rank_i in range(world_size):
226
+ print("load", f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")
227
+ predictions.extend(json.load(open(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")))
228
+ os.remove(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")
229
+ print("num:", len(predictions))
230
+ # save the predictions to a temporary file
231
+ random_uuid = str(uuid.uuid4())
232
+ with open(f"{vqa_dataset}results_{random_uuid}.json", "w") as f:
233
+ f.write(json.dumps(predictions, indent=4))
234
+
235
+ acc = compute_gqa_accuracy(predictions)
236
+ print(vqa_dataset, "score:", acc, "| save to", f"{vqa_dataset}results_{random_uuid}.json")
237
+ os.makedirs("eval_results", exist_ok=True)
238
+ with open(os.path.join("eval_results", f"{vqa_dataset}_{model.expr_name}_{model.step_num}_{int(time.time())}_{acc}"), "w") as f:
239
+ f.write(json.dumps(predictions, indent=2))
240
+
241
+ # delete the temporary file
242
+ os.remove(f"{vqa_dataset}results_{random_uuid}.json")
243
+ else:
244
+ time.sleep(5)
245
+ acc = 0.0
246
+ if world_size > 1:
247
+ torch.distributed.barrier()
248
+ return acc
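Note: the generation loop above relies on locating the position right after the `<|#image#|>` token so the model knows where the visual embeddings sit. Below is a minimal, self-contained sketch of that index bookkeeping on toy token ids; the ids are invented for illustration, the real ones come from the tokenizer.

```python
import torch

media_token_id = 7  # hypothetical id for <|#image#|>
input_ids = torch.tensor([
    [1, 7, 0, 0, 0, 9, 5],  # <bos> <|#image#|> <pad>x3 <|#endofimage#|> ...
    [1, 7, 0, 0, 0, 9, 6],
])

# Same expression as in the evaluation code: column index of <|#image#|> plus one.
image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
image_start_index_list = [[x] for x in image_start_index_list]
image_nums = [1] * len(input_ids)
print(image_start_index_list, image_nums)  # [[2], [2]] [1, 1]
```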
multimodal/build/lib/open_flamingo/eval/task/mmbench.py ADDED
@@ -0,0 +1,84 @@
1
+ import base64
2
+ import io
3
+ import random
4
+
5
+ import pandas as pd
6
+ from PIL import Image
7
+ from torch.utils.data import Dataset
8
+ from open_flamingo.eval.task.utils import get_object_from_text
9
+
10
+ def decode_base64_to_image(base64_string):
11
+ image_data = base64.b64decode(base64_string)
12
+ image = Image.open(io.BytesIO(image_data))
13
+ return image
14
+
15
+ class MMBenchDataset(Dataset):
16
+ def __init__(self,
17
+ data_file,
18
+ sys_prompt='There are several options:'):
19
+ self.df = pd.read_csv(data_file, sep='\t')
20
+ self.sys_prompt = sys_prompt
21
+
22
+ def __len__(self):
23
+ return len(self.df)
24
+
25
+ def __getitem__(self, idx):
26
+ index = self.df.iloc[idx]['index']
27
+ image = self.df.iloc[idx]['image']
28
+ image = decode_base64_to_image(image)
29
+ question = self.df.iloc[idx]['question']
30
+ answer = self.df.iloc[idx]['answer'] if 'answer' in self.df.iloc[0].keys() else None
31
+ category = self.df.iloc[idx]['category']
32
+ l2_category = self.df.iloc[idx]['l2-category']
33
+
34
+ option_candidate = ['A', 'B', 'C', 'D', 'E']
35
+ options = {
36
+ cand: self.load_from_df(idx, cand)
37
+ for cand in option_candidate
38
+ if self.load_from_df(idx, cand) is not None
39
+ }
40
+ options_prompt = f'{self.sys_prompt}\n'
41
+ for key, item in options.items():
42
+ options_prompt += f'{key}. {item}\n'
43
+
44
+ hint = self.load_from_df(idx, 'hint')
45
+ data = {
46
+ 'img': image,
47
+ 'question': question,
48
+ 'answer': answer,
49
+ 'options': options_prompt,
50
+ 'category': category,
51
+ 'l2-category': l2_category,
52
+ 'options_dict': options,
53
+ 'index': index,
54
+ 'context': hint,
55
+ }
56
+ return data
57
+ def load_from_df(self, idx, key):
58
+ if key in self.df.iloc[idx] and not pd.isna(self.df.iloc[idx][key]):
59
+ return self.df.iloc[idx][key]
60
+ else:
61
+ return None
62
+
63
+
64
+ def evaluate_mmbench(
65
+ model,
66
+ tokenizer,
67
+ image_processor,
68
+ batch_size=1,
69
+ image_dir_path=None,
70
+ questions_json_path=None,
71
+ annotations_json_path=None,
72
+ vis_embed_size=None,
73
+ rank=0,
74
+ world_size=1,
75
+ id=0,
76
+ ):
77
+ dataset_name = "mmbench"
78
+ dataset = MMBenchDataset("/gpfs/u/home/LMCG/LMCGljnn/scratch/datasets/raw/mmbench/mmbench_dev_20230712.tsv")
79
+ for sample in dataset:
80
+ print(sample)
81
+
82
+
83
+ if __name__ == '__main__':
84
+ evaluate_mmbench(None, None, None)
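Note: MMBenchDataset reads each image as a base64 string out of the TSV and assembles the multiple-choice prompt from whichever of the A–E columns are present. A self-contained sketch of both steps with made-up data (the real TSV schema is only assumed here):

```python
import base64
import io

from PIL import Image

# Round-trip a tiny in-memory image through base64, as decode_base64_to_image does.
buf = io.BytesIO()
Image.new("RGB", (8, 8), color=(255, 0, 0)).save(buf, format="PNG")
encoded = base64.b64encode(buf.getvalue()).decode("ascii")
decoded = Image.open(io.BytesIO(base64.b64decode(encoded)))
print(decoded.size)  # (8, 8)

# Options prompt assembly, mirroring __getitem__ for a toy row.
sys_prompt = "There are several options:"
options = {"A": "a cat", "B": "a dog"}
options_prompt = f"{sys_prompt}\n"
for key, item in options.items():
    options_prompt += f"{key}. {item}\n"
print(options_prompt)
```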
multimodal/build/lib/open_flamingo/eval/task/reg.py ADDED
@@ -0,0 +1,141 @@
1
+ import torch
2
+ from tqdm import tqdm
3
+ from PIL import Image
4
+ from io import BytesIO
5
+ import base64
6
+ import numpy as np
7
+ import time
8
+ import json
9
+ import os
10
+ import cv2
11
+ from coco_metric import compute_cider
12
+ import random
13
+ import pickle
14
+
15
+ def evaluate_reg(
16
+ model,
17
+ tokenizer,
18
+ image_processor,
19
+ vis_embed_size=None,
20
+ rank=0,
21
+ world_size=1,
22
+ id=0,
23
+ ):
24
+ lang_encoder_name = model.lang_encoder.__class__.__name__.lower()
25
+ dataset_name = "refcocog"
26
+ pkl_file = "/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal2/open_flamingo/eval/task/others/refcocog_reg_val_data.pkl"
27
+ try:
28
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
29
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
30
+ pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
31
+ bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
32
+ except:
33
+ pass
34
+
35
+ model.eval().cuda()
36
+ if world_size > 1:
37
+ torch.distributed.barrier()
38
+ this_tot = 0
39
+ predictions = []
40
+ D = pickle.load(open(pkl_file, "rb"))
41
+ lines = []
42
+ data = D["data"]
43
+ uniq_id_to_text = D["uniq_id_to_text"]
44
+ uniq_id_to_image = D["uniq_id_to_image"]
45
+ uniq_id_to_image_id = D["uniq_id_to_image_id"]
46
+ for image_id in data:
47
+ for region in data[image_id]:
48
+ uniq_id = data[image_id][region][0]
49
+ lines.append([uniq_id, uniq_id_to_image_id[uniq_id], [uniq_id_to_text[r] for r in data[image_id][region]], region, uniq_id_to_image[uniq_id]])
50
+ print("total data:", len(lines))
51
+ # lines = lines[:20]
52
+ pbar = tqdm(lines, disable=(rank != 0))
53
+ for ii, line in enumerate(pbar):
54
+ if ii % world_size != rank:
55
+ continue
56
+ uniq_id, image_id, text, region_coord, image = line
57
+ gt_box = np.array(region_coord)
58
+ width = image.width
59
+ height = image.height
60
+ image = image.resize((224, 224))
61
+ gt_box = gt_box / np.array([width, height, width, height]) * 224
62
+ batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
63
+ prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|><|#object#|><|#previsual#|><|#prebox#|><|#object#|>"]
64
+
65
+ encodings = tokenizer(
66
+ prompt,
67
+ padding="longest",
68
+ truncation=True,
69
+ return_tensors="pt",
70
+ max_length=2000,
71
+ )
72
+ input_ids = encodings["input_ids"]
73
+ attention_mask = encodings["attention_mask"]
74
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
75
+ image_start_index_list = [[x] for x in image_start_index_list]
76
+ image_nums = [1] * len(input_ids)
77
+ batch_images = batch_images.cuda()
78
+ input_ids = input_ids.cuda()
79
+ attention_mask = attention_mask.cuda()
80
+ added_bbox_list = [(torch.tensor(gt_box).cuda() / 224).clamp(0, 0.99).unsqueeze(0)]
81
+
82
+ with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):  # chaining with "and" would enter only one of the two context managers
83
+ outputs = model.generate(
84
+ batch_images,
85
+ input_ids,
86
+ attention_mask=attention_mask,
87
+ max_new_tokens=25,
88
+ min_length=5,
89
+ num_beams=8,
90
+ length_penalty=0,
91
+ image_start_index_list=image_start_index_list,
92
+ image_nums=image_nums,
93
+ added_bbox_list=added_bbox_list,
94
+ )
95
+ outputs = outputs[:, len(input_ids[0]) :]
96
+ new_prediction = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0].strip().lower()
97
+ this_tot += 1
98
+ if rank == 0 and this_tot % 10 == 0:
99
+ for i in range(1):
100
+ tqdm.write(f"answer: {text}\nmodel output: {new_prediction}")
101
+ predictions.append(
102
+ {"image_id": image_id, "caption": new_prediction}
103
+ )
104
+ results_path = f"reg_{lang_encoder_name}_{rank}_{id}.json"
105
+ json.dump(predictions, open(results_path, "w"))
106
+ print("save to", results_path)
107
+ del predictions
108
+ time.sleep(5)
109
+ if world_size > 1:
110
+ torch.distributed.barrier()
111
+ if rank == 0:
112
+ print(f"evaluate on rank {rank}. world size is {world_size}")
113
+ predictions = []
114
+ for rank_i in range(world_size):
115
+ part_results_path = f"reg_{lang_encoder_name}_{rank_i}_{id}.json"
116
+ print("load", part_results_path)
117
+ part_data = json.load(open(part_results_path))
118
+ predictions.extend(part_data)
119
+ os.remove(part_results_path)
120
+ print("num:", len(predictions))
121
+ results_path = f"reg_{lang_encoder_name}_{id}_result.json"
122
+ json.dump(predictions, open(results_path, "w"), indent=2)
123
+
124
+ metrics = compute_cider(
125
+ result_path=results_path,
126
+ annotations_path="/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal2/open_flamingo/eval/task/others/refcocog_reg_val_label.json",
127
+ )
128
+ os.makedirs("eval_results", exist_ok=True)
129
+ cider = metrics["CIDEr"]
130
+ print("cider", cider)
131
+ with open(os.path.join("eval_results", f"reg_{model.expr_name}_{model.step_num}_{int(time.time())}_{cider}"), "w") as f:
132
+ f.write(json.dumps(predictions, indent=2))
133
+ # delete the temporary file
134
+ os.remove(results_path)
135
+ return cider
136
+
137
+
138
+ if __name__ == "__main__":
139
+ anno = json.load(open("/gpfs/u/home/LMCG/LMCGljnn/scratch/.cache/lavis/coco_gt/coco_karpathy_test_gt.json"))
140
+ import pdb; pdb.set_trace()
141
+ print(anno.keys())
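Note: evaluate_reg resizes every image to 224x224 and rescales the ground-truth region into the unit square (clamped just below 1) before passing it as `added_bbox_list`. A small sketch of that normalization with an invented box:

```python
import numpy as np
import torch

width, height = 640, 480
gt_box = np.array([100.0, 50.0, 400.0, 300.0])  # x1, y1, x2, y2 in original pixels

# Same two steps as in evaluate_reg: map to the 224 grid, then to [0, 1).
gt_box = gt_box / np.array([width, height, width, height]) * 224
added_bbox = (torch.tensor(gt_box) / 224).clamp(0, 0.99).unsqueeze(0)
print(added_bbox)  # roughly [[0.1562, 0.1042, 0.6250, 0.6250]]
```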
multimodal/build/lib/open_flamingo/eval/task/utils.py ADDED
@@ -0,0 +1,287 @@
1
+ import spacy
2
+ import torch
3
+ from tqdm import tqdm
4
+ import numpy as np
5
+ import itertools
6
+ nlp = spacy.load('en_core_web_md')
7
+
8
+
9
+ def get_iou(box1, box2):
10
+ # box1 and box2 should be in the format [x1, y1, x2, y2]
11
+ intersection = max(0, min(box1[2], box2[2]) - max(box1[0], box2[0])) * \
12
+ max(0, min(box1[3], box2[3]) - max(box1[1], box2[1]))
13
+ area_box1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
14
+ area_box2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
15
+ union = area_box1 + area_box2 - intersection
16
+ iou = intersection / union if union > 0 else 0
17
+ return iou
18
+
19
+
20
+ # def find_root(token):
21
+ # if token.pos_ == "VERB":
22
+ # return token
23
+ # while token.dep_ not in ["pobj", "nsubj", "ROOT", "npadvmod", "dobj", "det", "prep", "punct", "cc", "conj", "acl", "dep", "appos", "relcl", "advmod", "nmod", "attr"]:
24
+ # token = token.head
25
+ # return token
26
+
27
+
28
+ def find_root(token):
29
+ if token.pos_ == "VERB":
30
+ return token
31
+ while token.dep_ in ["compound", "amod"]:
32
+ token = token.head
33
+ return token
34
+
35
+ def get_object_from_text(text, verbose=False):
36
+ if len(text.split(" ")) == 3:
37
+ text = text.split(" ")
38
+ return [text[0], text[-1]]
39
+ doc = nlp(text)
40
+ if verbose:
41
+ for TT in doc:
42
+ print(TT.text, TT.pos_, TT.dep_, TT.head)
43
+ roots = set()
44
+ for i, token in enumerate(doc):
45
+ roots.add(find_root(token))
46
+ exprs = []
47
+ roots = sorted(list(roots), key=lambda token: token.idx)
48
+ first_nsubj = True
49
+ if verbose:
50
+ print(roots)
51
+ for root in roots:
52
+ if root.pos_ not in ["NOUN", "PROPN"]:
53
+ continue
54
+ if root.dep_ not in ["pobj", "nsubj"]:
55
+ continue
56
+ if not first_nsubj and root.dep_ in ["nsubj"]:
57
+ continue
58
+ exprs.append([])
59
+ for token in doc:
60
+ if find_root(token) == root:
61
+ exprs[-1].append(token.text)
62
+ exprs[-1] = " ".join(exprs[-1]).replace(" '", "'")
63
+ if exprs[-1] not in text:
64
+ if verbose:
65
+ print("not in text error:", exprs[-1], "#",text)
66
+ # for TT in doc:
67
+ # print(TT.text, TT.pos_, TT.dep_, TT.head)
68
+ # import pdb; pdb.set_trace()
69
+ exprs.pop()
70
+ if first_nsubj and root.dep_ in ["nsubj"]:
71
+ first_nsubj = False
72
+ if len(exprs) <= 1:
73
+ if verbose:
74
+ print("not enough exprs error:", exprs, "#",text)
75
+ return []
76
+ return exprs
77
+
78
+ def is_correct(input_ids, logits, tokenizer, object: str, topk=5, N=10):
79
+ answer_id = torch.tensor(tokenizer(f" {object}", add_special_tokens=False)["input_ids"]).to(input_ids.device)
80
+ answer_begin_idx = (input_ids == answer_id[0]).nonzero()
81
+ answer_idx = None
82
+ for (batch_idx, IDX) in answer_begin_idx:
83
+ try:
84
+ if (input_ids[batch_idx, IDX:IDX+len(answer_id)] == answer_id).all():
85
+ answer_idx = list(range(IDX-1, IDX+len(answer_id)-1))
86
+ except:
87
+ pass
88
+ if answer_idx is None:
89
+ return np.inf, False, False
90
+ res = logits[0, answer_idx].softmax(-1).sort(descending=True)
91
+ values = res.values
92
+ indices = res.indices
93
+ chosen_ids = list(itertools.product(*([list(range(N))]*len(answer_idx))))
94
+ probs = []
95
+ for ids in chosen_ids:
96
+ prob = 1.0
97
+ for i, id in enumerate(ids):
98
+ prob *= values[i, id]
99
+ probs.append((prob.item(), ids))
100
+ probs.sort(reverse=True)
101
+ answer_pos = tuple([id_array.tolist().index(idx) for id_array, idx in zip(indices, answer_id)])
102
+ ranking = [p[1] for p in probs]
103
+ # if len(answer_idx) > 1:
104
+ # import pdb; pdb.set_trace()
105
+ try:
106
+ r = ranking.index(answer_pos)
107
+ return r, r < 1, r < 5
108
+ except:
109
+ return np.inf, False, False
110
+
111
+ def get_bbox(visual_box_list, batch_images, prompt, model, tokenizer, media_token_id, prebox_token_id, debug=False, return_all=False):
112
+ assert isinstance(prompt, list) and len(prompt) == 1 and isinstance(prompt[0], str)
113
+ encodings = tokenizer(
114
+ prompt,
115
+ padding="longest",
116
+ truncation=True,
117
+ return_tensors="pt",
118
+ max_length=2000,
119
+ )
120
+ input_ids = encodings["input_ids"]
121
+ attention_mask = encodings["attention_mask"]
122
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
123
+ image_start_index_list = [[x] for x in image_start_index_list]
124
+ image_nums = [1] * len(input_ids)
125
+ vision_x = batch_images.cuda()
126
+ lang_x = input_ids.cuda()
127
+ attention_mask = attention_mask.cuda()
128
+
129
+ model.debug_id = 0
130
+ with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):  # chaining with "and" would enter only one of the two context managers
131
+ outputs = model(
132
+ vision_x=vision_x,
133
+ lang_x=lang_x,
134
+ attention_mask=attention_mask,
135
+ labels=None,
136
+ image_nums=image_nums,
137
+ image_start_index_list=image_start_index_list,
138
+ added_bbox_list=visual_box_list,
139
+ add_box=visual_box_list is not None,
140
+ relations=None,
141
+ debug_mode=False,
142
+ )
143
+ boxes = outputs["boxes"]
144
+ scores = outputs["scores"]
145
+ if debug:
146
+ import pdb; pdb.set_trace()
147
+ if return_all:
148
+ return boxes, scores
149
+ if len(scores) == 0:
150
+ return None, None
151
+ else:
152
+ return boxes[scores.argmax()], scores.max()
153
+
154
+
155
+ def _eval_text_image(text, image, model, tokenizer, image_processor, vis_embed_size, media_token_id, prebox_token_id, debug=False, objects=None):
156
+ batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
157
+ if objects is None:
158
+ objects = get_object_from_text(text)
159
+ if len(objects) == 0:
160
+ return None, None, None
161
+ if debug:
162
+ tqdm.write(text)
163
+ tqdm.write(f"{objects}")
164
+ first_idx = text.find(objects[0])
165
+ if first_idx == 0:
166
+ first_text = f"<|#object#|>{objects[0]}<|#endofobject#|><|#visual#|>"
167
+ else:
168
+ first_text = text[:first_idx-1] + f"<|#object#|> {objects[0]}<|#endofobject#|><|#visual#|>"
169
+
170
+ if debug:
171
+ tqdm.write(first_text)
172
+ prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>{first_text}"]
173
+ # import pdb; pdb.set_trace()
174
+ # print("do first get_bbox |", first_text)
175
+ first_box, first_score = get_bbox(None, batch_images, prompt, model, tokenizer, media_token_id, prebox_token_id, return_all=False)
176
+ if not model.valid and debug:
177
+ import pdb; pdb.set_trace()
178
+ if first_box is not None:
179
+ added_bbox_list = [torch.tensor(first_box).unsqueeze(0).cuda() / 224]
180
+ text = first_text + "<|#box#|><|#endofobject#|>" + text[first_idx+len(objects[0]):]
181
+ else:
182
+ added_bbox_list = []
183
+
184
+ final_ranks = []
185
+ is_top1_list = []
186
+ is_top5_list = []
187
+ for kk, object in enumerate(objects):
188
+ if kk == 0:
189
+ continue
190
+ idx = text.find(objects[0])
191
+ for t_i, temp in enumerate(objects[1:kk+1]):
192
+ # t_i is actually the previous one. This is not a bug
193
+ idx = text.find(temp, idx + len(objects[t_i]))
194
+ while idx+len(temp) != len(text) and (text[idx-1] == "#" or text[idx+len(temp)] == "#"):
195
+ # in case temp is box or object or visual or something like that
196
+ idx = text.find(temp, idx + len(temp))
197
+ this_text = text[:idx-1] + "<|#object#|><|#previsual#|>"
198
+ # if this_text == "<|#object#|><|#previsual#|>":
199
+ # import pdb; pdb.set_trace()
200
+ if debug:
201
+ tqdm.write(this_text)
202
+ prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>{this_text}"]
203
+ # import pdb; pdb.set_trace()
204
+ # print("do pre get_bbox |", this_text)
205
+ pre_boxes, pre_scores = get_bbox(added_bbox_list, batch_images, prompt, model, tokenizer, media_token_id,
206
+ prebox_token_id, return_all=True)
207
+ if not model.valid and debug:
208
+ import pdb; pdb.set_trace()
209
+ logits_list = []
210
+ # pre_boxes = [pre_boxes[0]]
211
+ # pre_scores = [pre_scores[0]]
212
+ this_text = this_text + f"<|#prebox#|><|#object#|> {object}<|#endofobject#|>"
213
+ for pre_box, pre_score in zip(pre_boxes, pre_scores):
214
+ prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>{this_text}"]
215
+ encodings = tokenizer(
216
+ prompt,
217
+ padding="longest",
218
+ truncation=True,
219
+ return_tensors="pt",
220
+ max_length=512,
221
+ )
222
+ input_ids = encodings["input_ids"]
223
+ attention_mask = encodings["attention_mask"]
224
+ image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
225
+ image_start_index_list = [[x] for x in image_start_index_list]
226
+ image_nums = [1] * len(input_ids)
227
+ vision_x = batch_images.cuda()
228
+ lang_x = input_ids.cuda()
229
+ attention_mask = attention_mask.cuda()
230
+ this_added_bbox_list = added_bbox_list + [torch.tensor(pre_box).unsqueeze(0).cuda() / 224]
231
+
232
+ with torch.cuda.amp.autocast(dtype=torch.float16), torch.no_grad():  # chaining with "and" would enter only one of the two context managers
233
+ outputs = model(
234
+ vision_x=vision_x,
235
+ lang_x=lang_x,
236
+ attention_mask=attention_mask,
237
+ image_nums=image_nums,
238
+ image_start_index_list=image_start_index_list,
239
+ added_bbox_list=this_added_bbox_list,
240
+ add_box=this_added_bbox_list is not None and len(this_added_bbox_list) != 0,
241
+ relations=None,
242
+ )
243
+ if not model.valid and debug:
244
+ import pdb; pdb.set_trace()
245
+ logits_list.append([pre_score, outputs.logits])
246
+ if debug:
247
+ answer_start_idx = (lang_x == tokenizer("<|#object#|>", add_special_tokens=False)["input_ids"][-1]).nonzero()[-1][1]
248
+ logits = outputs["logits"][0, answer_start_idx:]
249
+ tqdm.write(tokenizer.decode(logits[0].sort(descending=True).indices.tolist()[:10]))
250
+ # if debug:
251
+ # image.save("Atest.png")
252
+ # open_cv_image = np.array(image)
253
+ # open_cv_image = open_cv_image[:, :, ::-1].copy()
254
+ # if first_box is not None:
255
+ # open_cv_image = cv2.rectangle(open_cv_image, first_box[:2].astype(int), first_box[2:].astype(int), (255, 0, 0), 2)
256
+ # if pre_box is not None:
257
+ # open_cv_image = cv2.rectangle(open_cv_image, pre_box[:2].astype(int), pre_box[2:].astype(int), (0, 255, 0), 2)
258
+ # cv2.imwrite(f"Atest.png", open_cv_image)
259
+ # import pdb; pdb.set_trace()
260
+ pre_scores = np.array([x[0] for x in logits_list])
261
+ final_probs = 0.0
262
+ for score, (_, logits) in zip(pre_scores, logits_list):
263
+ final_probs += score * logits.softmax(-1)
264
+ assert input_ids.shape[:2] == final_probs.shape[:2]
265
+ _rank, is_top1, is_top5 = is_correct(input_ids, final_probs, tokenizer, object, topk=5)
266
+ final_ranks.append(_rank)
267
+ is_top1_list.append(is_top1)
268
+ is_top5_list.append(is_top5)
269
+ this_text = text[:idx-1] + f"<|#object#|> {object}<|#endofobject#|><|#visual#|>"
270
+ if debug:
271
+ tqdm.write(this_text)
272
+ prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>{this_text}"]
273
+ # print("do this get_bbox |", this_text)
274
+ this_box, this_score = get_bbox(added_bbox_list, batch_images, prompt, model, tokenizer, media_token_id, prebox_token_id, return_all=False)
275
+ if not model.valid and debug:
276
+ import pdb; pdb.set_trace()
277
+ if this_box is not None:
278
+ added_bbox_list += [torch.tensor(this_box).unsqueeze(0).cuda() / 224]
279
+ text = this_text + "<|#box#|><|#endofobject#|>" + text[idx+len(object):]
280
+ return final_ranks, is_top1_list, is_top5_list
281
+
282
+
283
+
284
+
285
+ if __name__ == "__main__":
286
+ # print(get_object_from_text("there is a cookie. there is a bear. white orio cookie is next to the teddy bear. car runs on the traffic road. there is a tree.", verbose=False))
287
+ print(get_object_from_text("President speaks to an American at a business office",verbose=True))
multimodal/build/lib/open_flamingo/eval/task/vl_checklist.py ADDED
@@ -0,0 +1,113 @@
1
+ import json
2
+ import webdataset as wds
3
+ from tqdm import tqdm
4
+ from PIL import Image
5
+ import torch
6
+ import numpy as np
7
+ import os
8
+ import time
9
+ import cv2
10
+ import random
11
+ from open_flamingo.eval.task.utils import (
12
+ get_object_from_text,
13
+ is_correct,
14
+ _eval_text_image,
15
+ )
16
+ DATASET_ROOT = "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/cdl/instruct_data/instruct/vl_checklist/Relation/000000.tar"
17
+
18
+ def evaluate_vlc(
19
+ model,
20
+ tokenizer,
21
+ image_processor,
22
+ vis_embed_size=None,
23
+ rank=0,
24
+ world_size=1,
25
+ id=0,
26
+ subset=True,
27
+ subset_size="5k",
28
+ debug=False,
29
+ ):
30
+ dataset_name = "vlc"
31
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
32
+ box_token_id = tokenizer("<|#box#|>", add_special_tokens=False)["input_ids"][-1]
33
+ endofobject_token_id = tokenizer("<|#endofobject#|>", add_special_tokens=False)["input_ids"][-1]
34
+ endofattr_token_id = tokenizer("<|#endofattr#|>", add_special_tokens=False)["input_ids"][-1]
35
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
36
+ visual_token_id = tokenizer("<|#visual#|>", add_special_tokens=False)["input_ids"][-1]
37
+ previsual_token_id = tokenizer("<|#previsual#|>", add_special_tokens=False)["input_ids"][-1]
38
+ prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
39
+ model.eval().cuda()
40
+ total = 0
41
+ n_top1 = 0
42
+ n_top5 = 0
43
+ n_top10 = 0
44
+ filename = "/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal2/open_flamingo/eval/task/vlc_data.json" if not subset else f"/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal2/open_flamingo/eval/task/vlc_data_subset_{subset_size}.json"
45
+ dataset = json.load(open(filename))
46
+
47
+ pbar = tqdm(dataset, disable=(rank != 0))
48
+ for ii, sample in enumerate(pbar):
49
+ if ii % world_size != rank:
50
+ continue
51
+ text, image_path = sample
52
+ image = Image.open(image_path).convert("RGB")
53
+ image = image.resize((224, 224))
54
+ final_ranks, is_top1_list, is_top5_list = _eval_text_image(text, image, model, tokenizer, image_processor, vis_embed_size, media_token_id, prebox_token_id, debug=debug)
55
+ if final_ranks is None:
56
+ continue
57
+ n_top1 += int(sum(is_top1_list))
58
+ n_top5 += int(sum(is_top5_list))
59
+ n_top10 += int((np.array(final_ranks) < 10).sum())
60
+ total += len(final_ranks)
61
+ if debug:
62
+ tqdm.write("="*80)
63
+ pbar.set_description(f"acc@top1: {n_top1 / total:.4f} | acc@top5: {n_top5 / total:.4f} | acc@top10: {n_top10 / total:.4f} | {final_ranks} |{text}")
64
+
65
+
66
+ with open(f"{dataset_name}_results_part{rank}_{id}.json", "w") as f:
67
+ f.write(json.dumps([total, n_top1, n_top5, n_top10]))
68
+ if world_size > 1:
69
+ torch.distributed.barrier()
70
+ if rank == 0:
71
+ total = 0
72
+ n_top1 = 0
73
+ n_top5 = 0
74
+ n_top10 = 0
75
+ print(f"evaluate on rank {rank}. world size is {world_size}")
76
+ for rank_i in range(world_size):
77
+ [total_part, n_top1_part, n_top5_part, n_top10_part] = json.load(open(f"{dataset_name}_results_part{rank_i}_{id}.json"))
78
+ os.remove(f"{dataset_name}_results_part{rank_i}_{id}.json")
79
+ total += total_part
80
+ n_top1 += n_top1_part
81
+ n_top5 += n_top5_part
82
+ n_top10 += n_top10_part
83
+ print("acc@top1:", n_top1 / total, "acc@top5:", n_top5 / total, "acc@top10:", n_top10 / total, "total:", total)
84
+ with open(os.path.join("eval_results", f"{dataset_name}_{model.expr_name}_{model.step_num}_{int(time.time())}_{n_top1 / total}_{n_top5 / total}_{n_top10 / total}_{total}"), "w") as f:
85
+ pass
+ score = n_top1 / total  # report top-1 accuracy; without this, `score` is undefined on rank 0
86
+ else:
87
+ score = 0.0
88
+ if world_size > 1:
89
+ torch.distributed.barrier()
90
+ return score
91
+
92
+
93
+ if __name__ == "__main__":
94
+ dataset = wds.WebDataset(DATASET_ROOT).decode().shuffle(100000).to_tuple("data.pyd", "dataset.txt", "image_path.txt")
95
+ labels = set()
96
+ texts = []
97
+ data_pair = []
98
+ if not os.path.exists("vlc_data.json"):
99
+ for sample in tqdm(dataset):
100
+ data, dataset_name, image_path = sample
101
+ text = data[-1]["POS"][0]
102
+ texts.append(text)
103
+ data_pair.append([text, image_path])
104
+ json.dump(data_pair, open("vlc_data.json", "w"), indent=1)
105
+ else:
106
+ print("data exists")
107
+ data_pair = json.load(open("vlc_data.json"))
108
+ for text, image_path in data_pair:
109
+ texts.append(text)
110
+
111
+
112
+
113
+ print(get_object_from_text("crow attacks the dove"))
multimodal/build/lib/open_flamingo/eval/vqa_metric.py ADDED
@@ -0,0 +1,594 @@
1
+ import copy
2
+ import datetime
3
+ import json
4
+ import os
5
+ import random
6
+ import re
7
+ import sys
8
+
9
+ # Interface for accessing the VQA dataset.
10
+
11
+ # This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link:
12
+ # (https://github.com/pdollar/coco/blob/master/PythonAPI/pycocotools/coco.py).
13
+
14
+ # The following functions are defined:
15
+ # VQA - VQA class that loads VQA annotation file and prepares data structures.
16
+ # getQuesIds - Get question ids that satisfy given filter conditions.
17
+ # getImgIds - Get image ids that satisfy given filter conditions.
18
+ # loadQA - Load questions and answers with the specified question ids.
19
+ # showQA - Display the specified questions and answers.
20
+ # loadRes - Load result file and create result object.
21
+
22
+ # Help on each function can be accessed by: "help(COCO.function)"
23
+
24
+
25
+ class VQA:
26
+ def __init__(self, annotation_file=None, question_file=None):
27
+ """
28
+ Constructor of VQA helper class for reading and visualizing questions and answers.
29
+ :param annotation_file (str): location of VQA annotation file
30
+ :return:
31
+ """
32
+ # load dataset
33
+ self.dataset = {}
34
+ self.questions = {}
35
+ self.qa = {}
36
+ self.qqa = {}
37
+ self.imgToQA = {}
38
+ if not annotation_file == None and not question_file == None:
39
+ print("loading VQA annotations and questions into memory...")
40
+ time_t = datetime.datetime.utcnow()
41
+ dataset = json.load(open(annotation_file, "r"))
42
+ questions = json.load(open(question_file, "r"))
43
+ print(datetime.datetime.utcnow() - time_t)
44
+ self.dataset = dataset
45
+ self.questions = questions
46
+ self.createIndex()
47
+
48
+ def createIndex(self):
49
+ # create index
50
+ print("creating index...")
51
+ imgToQA = {ann["image_id"]: [] for ann in self.dataset["annotations"]}
52
+ qa = {ann["question_id"]: [] for ann in self.dataset["annotations"]}
53
+ qqa = {ann["question_id"]: [] for ann in self.dataset["annotations"]}
54
+ for ann in self.dataset["annotations"]:
55
+ imgToQA[ann["image_id"]] += [ann]
56
+ qa[ann["question_id"]] = ann
57
+ for ques in self.questions["questions"]:
58
+ qqa[ques["question_id"]] = ques
59
+ print("index created!")
60
+
61
+ # create class members
62
+ self.qa = qa
63
+ self.qqa = qqa
64
+ self.imgToQA = imgToQA
65
+
66
+ def info(self):
67
+ """
68
+ Print information about the VQA annotation file.
69
+ :return:
70
+ """
71
+ for key, value in self.dataset["info"].items():
72
+ print("%s: %s" % (key, value))
73
+
74
+ def getQuesIds(self, imgIds=[], quesTypes=[], ansTypes=[]):
75
+ """
76
+ Get question ids that satisfy given filter conditions. default skips that filter
77
+ :param imgIds (int array) : get question ids for given imgs
78
+ quesTypes (str array) : get question ids for given question types
79
+ ansTypes (str array) : get question ids for given answer types
80
+ :return: ids (int array) : integer array of question ids
81
+ """
82
+ imgIds = imgIds if type(imgIds) == list else [imgIds]
83
+ quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
84
+ ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]
85
+
86
+ if len(imgIds) == len(quesTypes) == len(ansTypes) == 0:
87
+ anns = self.dataset["annotations"]
88
+ else:
89
+ if not len(imgIds) == 0:
90
+ anns = sum(
91
+ [self.imgToQA[imgId] for imgId in imgIds if imgId in self.imgToQA],
92
+ [],
93
+ )
94
+ else:
95
+ anns = self.dataset["annotations"]
96
+ anns = (
97
+ anns
98
+ if len(quesTypes) == 0
99
+ else [ann for ann in anns if ann["question_type"] in quesTypes]
100
+ )
101
+ anns = (
102
+ anns
103
+ if len(ansTypes) == 0
104
+ else [ann for ann in anns if ann["answer_type"] in ansTypes]
105
+ )
106
+ ids = [ann["question_id"] for ann in anns]
107
+ return ids
108
+
109
+ def getImgIds(self, quesIds=[], quesTypes=[], ansTypes=[]):
110
+ """
111
+ Get image ids that satisfy given filter conditions. default skips that filter
112
+ :param quesIds (int array) : get image ids for given question ids
113
+ quesTypes (str array) : get image ids for given question types
114
+ ansTypes (str array) : get image ids for given answer types
115
+ :return: ids (int array) : integer array of image ids
116
+ """
117
+ quesIds = quesIds if type(quesIds) == list else [quesIds]
118
+ quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
119
+ ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]
120
+
121
+ if len(quesIds) == len(quesTypes) == len(ansTypes) == 0:
122
+ anns = self.dataset["annotations"]
123
+ else:
124
+ if not len(quesIds) == 0:
125
+ anns = sum(
126
+ [self.qa[quesId] for quesId in quesIds if quesId in self.qa], []
127
+ )
128
+ else:
129
+ anns = self.dataset["annotations"]
130
+ anns = (
131
+ anns
132
+ if len(quesTypes) == 0
133
+ else [ann for ann in anns if ann["question_type"] in quesTypes]
134
+ )
135
+ anns = (
136
+ anns
137
+ if len(ansTypes) == 0
138
+ else [ann for ann in anns if ann["answer_type"] in ansTypes]
139
+ )
140
+ ids = [ann["image_id"] for ann in anns]
141
+ return ids
142
+
143
+ def loadQA(self, ids=[]):
144
+ """
145
+ Load questions and answers with the specified question ids.
146
+ :param ids (int array) : integer ids specifying question ids
147
+ :return: qa (object array) : loaded qa objects
148
+ """
149
+ if type(ids) == list:
150
+ return [self.qa[id] for id in ids]
151
+ elif type(ids) == int:
152
+ return [self.qa[ids]]
153
+
154
+ def showQA(self, anns):
155
+ """
156
+ Display the specified annotations.
157
+ :param anns (array of object): annotations to display
158
+ :return: None
159
+ """
160
+ if len(anns) == 0:
161
+ return 0
162
+ for ann in anns:
163
+ quesId = ann["question_id"]
164
+ print("Question: %s" % (self.qqa[quesId]["question"]))
165
+ for ans in ann["answers"]:
166
+ print("Answer %d: %s" % (ans["answer_id"], ans["answer"]))
167
+
168
+ def loadRes(self, resFile, quesFile):
169
+ """
170
+ Load result file and return a result object.
171
+ :param resFile (str) : file name of result file
172
+ :return: res (obj) : result api object
173
+ """
174
+ res = VQA()
175
+ res.questions = json.load(open(quesFile))
176
+ res.dataset["info"] = copy.deepcopy(self.questions["info"])
177
+ res.dataset["task_type"] = copy.deepcopy(self.questions["task_type"])
178
+ res.dataset["data_type"] = copy.deepcopy(self.questions["data_type"])
179
+ res.dataset["data_subtype"] = copy.deepcopy(self.questions["data_subtype"])
180
+ res.dataset["license"] = copy.deepcopy(self.questions["license"])
181
+
182
+ print("Loading and preparing results... ")
183
+ time_t = datetime.datetime.utcnow()
184
+ anns = json.load(open(resFile))
185
+ assert type(anns) == list, "results is not an array of objects"
186
+ annsQuesIds = [ann["question_id"] for ann in anns]
187
+ # print set of question ids that do not have corresponding annotations
188
+
189
+ # assert set(annsQuesIds) == set(self.getQuesIds()), \
190
+ # 'Results do not correspond to current VQA set. Either the results do not have predictions for all question ids in annotation file or there is atleast one question id that does not belong to the question ids in the annotation file.'
191
+ for ann in anns:
192
+ quesId = ann["question_id"]
193
+ if res.dataset["task_type"] == "Multiple Choice":
194
+ assert (
195
+ ann["answer"] in self.qqa[quesId]["multiple_choices"]
196
+ ), "predicted answer is not one of the multiple choices"
197
+ qaAnn = self.qa[quesId]
198
+ ann["image_id"] = qaAnn["image_id"]
199
+ ann["question_type"] = qaAnn["question_type"]
200
+ ann["answer_type"] = qaAnn["answer_type"]
201
+ print(
202
+ "DONE (t=%0.2fs)" % ((datetime.datetime.utcnow() - time_t).total_seconds())
203
+ )
204
+
205
+ res.dataset["annotations"] = anns
206
+ res.createIndex()
207
+ return res
208
+
209
+
210
+ class VQAEval:
211
+ def __init__(self, vqa=None, vqaRes=None, n=2):
212
+ self.n = n
213
+ self.accuracy = {}
214
+ self.evalQA = {}
215
+ self.evalQuesType = {}
216
+ self.evalAnsType = {}
217
+ self.vqa = vqa
218
+ self.vqaRes = vqaRes
219
+ if vqaRes is not None:
220
+ self.params = {"question_id": vqaRes.getQuesIds()}
221
+ self.contractions = {
222
+ "aint": "ain't",
223
+ "arent": "aren't",
224
+ "cant": "can't",
225
+ "couldve": "could've",
226
+ "couldnt": "couldn't",
227
+ "couldn'tve": "couldn't've",
228
+ "couldnt've": "couldn't've",
229
+ "didnt": "didn't",
230
+ "doesnt": "doesn't",
231
+ "dont": "don't",
232
+ "hadnt": "hadn't",
233
+ "hadnt've": "hadn't've",
234
+ "hadn'tve": "hadn't've",
235
+ "hasnt": "hasn't",
236
+ "havent": "haven't",
237
+ "hed": "he'd",
238
+ "hed've": "he'd've",
239
+ "he'dve": "he'd've",
240
+ "hes": "he's",
241
+ "howd": "how'd",
242
+ "howll": "how'll",
243
+ "hows": "how's",
244
+ "Id've": "I'd've",
245
+ "I'dve": "I'd've",
246
+ "Im": "I'm",
247
+ "Ive": "I've",
248
+ "isnt": "isn't",
249
+ "itd": "it'd",
250
+ "itd've": "it'd've",
251
+ "it'dve": "it'd've",
252
+ "itll": "it'll",
253
+ "let's": "let's",
254
+ "maam": "ma'am",
255
+ "mightnt": "mightn't",
256
+ "mightnt've": "mightn't've",
257
+ "mightn'tve": "mightn't've",
258
+ "mightve": "might've",
259
+ "mustnt": "mustn't",
260
+ "mustve": "must've",
261
+ "neednt": "needn't",
262
+ "notve": "not've",
263
+ "oclock": "o'clock",
264
+ "oughtnt": "oughtn't",
265
+ "ow's'at": "'ow's'at",
266
+ "'ows'at": "'ow's'at",
267
+ "'ow'sat": "'ow's'at",
268
+ "shant": "shan't",
269
+ "shed've": "she'd've",
270
+ "she'dve": "she'd've",
271
+ "she's": "she's",
272
+ "shouldve": "should've",
273
+ "shouldnt": "shouldn't",
274
+ "shouldnt've": "shouldn't've",
275
+ "shouldn'tve": "shouldn't've",
276
+ "somebody'd": "somebodyd",
277
+ "somebodyd've": "somebody'd've",
278
+ "somebody'dve": "somebody'd've",
279
+ "somebodyll": "somebody'll",
280
+ "somebodys": "somebody's",
281
+ "someoned": "someone'd",
282
+ "someoned've": "someone'd've",
283
+ "someone'dve": "someone'd've",
284
+ "someonell": "someone'll",
285
+ "someones": "someone's",
286
+ "somethingd": "something'd",
287
+ "somethingd've": "something'd've",
288
+ "something'dve": "something'd've",
289
+ "somethingll": "something'll",
290
+ "thats": "that's",
291
+ "thered": "there'd",
292
+ "thered've": "there'd've",
293
+ "there'dve": "there'd've",
294
+ "therere": "there're",
295
+ "theres": "there's",
296
+ "theyd": "they'd",
297
+ "theyd've": "they'd've",
298
+ "they'dve": "they'd've",
299
+ "theyll": "they'll",
300
+ "theyre": "they're",
301
+ "theyve": "they've",
302
+ "twas": "'twas",
303
+ "wasnt": "wasn't",
304
+ "wed've": "we'd've",
305
+ "we'dve": "we'd've",
306
+ "weve": "we've",
307
+ "werent": "weren't",
308
+ "whatll": "what'll",
309
+ "whatre": "what're",
310
+ "whats": "what's",
311
+ "whatve": "what've",
312
+ "whens": "when's",
313
+ "whered": "where'd",
314
+ "wheres": "where's",
315
+ "whereve": "where've",
316
+ "whod": "who'd",
317
+ "whod've": "who'd've",
318
+ "who'dve": "who'd've",
319
+ "wholl": "who'll",
320
+ "whos": "who's",
321
+ "whove": "who've",
322
+ "whyll": "why'll",
323
+ "whyre": "why're",
324
+ "whys": "why's",
325
+ "wont": "won't",
326
+ "wouldve": "would've",
327
+ "wouldnt": "wouldn't",
328
+ "wouldnt've": "wouldn't've",
329
+ "wouldn'tve": "wouldn't've",
330
+ "yall": "y'all",
331
+ "yall'll": "y'all'll",
332
+ "y'allll": "y'all'll",
333
+ "yall'd've": "y'all'd've",
334
+ "y'alld've": "y'all'd've",
335
+ "y'all'dve": "y'all'd've",
336
+ "youd": "you'd",
337
+ "youd've": "you'd've",
338
+ "you'dve": "you'd've",
339
+ "youll": "you'll",
340
+ "youre": "you're",
341
+ "youve": "you've",
342
+ }
343
+ self.manualMap = {
344
+ "none": "0",
345
+ "zero": "0",
346
+ "one": "1",
347
+ "two": "2",
348
+ "three": "3",
349
+ "four": "4",
350
+ "five": "5",
351
+ "six": "6",
352
+ "seven": "7",
353
+ "eight": "8",
354
+ "nine": "9",
355
+ "ten": "10",
356
+ }
357
+ self.articles = ["a", "an", "the"]
358
+
359
+ self.periodStrip = re.compile(r"(?!<=\d)(\.)(?!\d)")
360
+ self.commaStrip = re.compile(r"(\d)(\,)(\d)")
361
+ self.punct = [
362
+ ";",
363
+ r"/",
364
+ "[",
365
+ "]",
366
+ '"',
367
+ "{",
368
+ "}",
369
+ "(",
370
+ ")",
371
+ "=",
372
+ "+",
373
+ "\\",
374
+ "_",
375
+ "-",
376
+ ">",
377
+ "<",
378
+ "@",
379
+ "`",
380
+ ",",
381
+ "?",
382
+ "!",
383
+ ]
384
+
385
+ def evaluate(self, quesIds=None):
386
+ if quesIds == None:
387
+ quesIds = [quesId for quesId in self.params["question_id"]]
388
+ gts = {}
389
+ res = {}
390
+ for quesId in quesIds:
391
+ gts[quesId] = self.vqa.qa[quesId]
392
+ res[quesId] = self.vqaRes.qa[quesId]
393
+
394
+ # =================================================
395
+ # Compute accuracy
396
+ # =================================================
397
+ accQA = []
398
+ accQuesType = {}
399
+ accAnsType = {}
400
+ print("computing accuracy")
401
+ step = 0
402
+ for quesId in quesIds:
403
+ for ansDic in gts[quesId]["answers"]:
404
+ ansDic["answer"] = ansDic["answer"].replace("\n", " ")
405
+ ansDic["answer"] = ansDic["answer"].replace("\t", " ")
406
+ ansDic["answer"] = ansDic["answer"].strip()
407
+ resAns = res[quesId]["answer"]
408
+ resAns = resAns.replace("\n", " ")
409
+ resAns = resAns.replace("\t", " ")
410
+ resAns = resAns.strip()
411
+ gtAcc = []
412
+ gtAnswers = [ans["answer"] for ans in gts[quesId]["answers"]]
413
+
414
+ if len(set(gtAnswers)) > 1:
415
+ for ansDic in gts[quesId]["answers"]:
416
+ ansDic["answer"] = self.processPunctuation(ansDic["answer"])
417
+ ansDic["answer"] = self.processDigitArticle(ansDic["answer"])
418
+ resAns = self.processPunctuation(resAns)
419
+ resAns = self.processDigitArticle(resAns)
420
+
421
+ for gtAnsDatum in gts[quesId]["answers"]:
422
+ otherGTAns = [
423
+ item for item in gts[quesId]["answers"] if item != gtAnsDatum
424
+ ]
425
+ matchingAns = [item for item in otherGTAns if item["answer"] == resAns]
426
+ acc = min(1, float(len(matchingAns)) / 3)
427
+ gtAcc.append(acc)
428
+ quesType = gts[quesId]["question_type"]
429
+ ansType = gts[quesId]["answer_type"]
430
+ avgGTAcc = float(sum(gtAcc)) / len(gtAcc)
431
+ accQA.append(avgGTAcc)
432
+ if quesType not in accQuesType:
433
+ accQuesType[quesType] = []
434
+ accQuesType[quesType].append(avgGTAcc)
435
+ if ansType not in accAnsType:
436
+ accAnsType[ansType] = []
437
+ accAnsType[ansType].append(avgGTAcc)
438
+ self.setEvalQA(quesId, avgGTAcc)
439
+ self.setEvalQuesType(quesId, quesType, avgGTAcc)
440
+ self.setEvalAnsType(quesId, ansType, avgGTAcc)
441
+ if step % 100 == 0:
442
+ self.updateProgress(step / float(len(quesIds)))
443
+ step = step + 1
444
+
445
+ self.setAccuracy(accQA, accQuesType, accAnsType)
446
+ print("Done computing accuracy")
447
+
448
+ def processPunctuation(self, inText):
449
+ outText = inText
450
+ for p in self.punct:
451
+ if (p + " " in inText or " " + p in inText) or (
452
+ re.search(self.commaStrip, inText) != None
453
+ ):
454
+ outText = outText.replace(p, "")
455
+ else:
456
+ outText = outText.replace(p, " ")
457
+ outText = self.periodStrip.sub("", outText, re.UNICODE)
458
+ return outText
459
+
460
+ def processDigitArticle(self, inText):
461
+ outText = []
462
+ tempText = inText.lower().split()
463
+ for word in tempText:
464
+ word = self.manualMap.setdefault(word, word)
465
+ if word not in self.articles:
466
+ outText.append(word)
467
+ else:
468
+ pass
469
+ for wordId, word in enumerate(outText):
470
+ if word in self.contractions:
471
+ outText[wordId] = self.contractions[word]
472
+ outText = " ".join(outText)
473
+ return outText
474
+
475
+ def setAccuracy(self, accQA, accQuesType, accAnsType):
476
+ self.accuracy["overall"] = round(100 * float(sum(accQA)) / len(accQA), self.n)
477
+ self.accuracy["perQuestionType"] = {
478
+ quesType: round(
479
+ 100 * float(sum(accQuesType[quesType])) / len(accQuesType[quesType]),
480
+ self.n,
481
+ )
482
+ for quesType in accQuesType
483
+ }
484
+ self.accuracy["perAnswerType"] = {
485
+ ansType: round(
486
+ 100 * float(sum(accAnsType[ansType])) / len(accAnsType[ansType]), self.n
487
+ )
488
+ for ansType in accAnsType
489
+ }
490
+
491
+ def setEvalQA(self, quesId, acc):
492
+ self.evalQA[quesId] = round(100 * acc, self.n)
493
+
494
+ def setEvalQuesType(self, quesId, quesType, acc):
495
+ if quesType not in self.evalQuesType:
496
+ self.evalQuesType[quesType] = {}
497
+ self.evalQuesType[quesType][quesId] = round(100 * acc, self.n)
498
+
499
+ def setEvalAnsType(self, quesId, ansType, acc):
500
+ if ansType not in self.evalAnsType:
501
+ self.evalAnsType[ansType] = {}
502
+ self.evalAnsType[ansType][quesId] = round(100 * acc, self.n)
503
+
504
+ def updateProgress(self, progress):
505
+ barLength = 20
506
+ status = ""
507
+ if isinstance(progress, int):
508
+ progress = float(progress)
509
+ if not isinstance(progress, float):
510
+ progress = 0
511
+ status = "error: progress var must be float\r\n"
512
+ if progress < 0:
513
+ progress = 0
514
+ status = "Halt...\r\n"
515
+ if progress >= 1:
516
+ progress = 1
517
+ status = "Done...\r\n"
518
+ block = int(round(barLength * progress))
519
+ text = "\rFinshed Percent: [{0}] {1}% {2}".format(
520
+ "#" * block + "-" * (barLength - block), int(progress * 100), status
521
+ )
522
+ sys.stdout.write(text)
523
+ sys.stdout.flush()
524
+
525
+
526
+ def compute_vqa_accuracy(result_json_path, question_json_path, annotation_json_path, vqa_dataset):
527
+ """Compute the VQA accuracy metric.
528
+
529
+ Args:
530
+ result_json_path (str): path to the model predictions json file
+ question_json_path (str): path to the VQA questions json file
531
+ annotation_json_path (str): path to the VQA annotations json file
+ vqa_dataset (str): name of the VQA dataset being evaluated
532
+
533
+ Returns:
534
+ float: VQA accuracy
535
+ """
536
+ # coding: utf-8
537
+ # dataDir = data_dir
538
+
539
+ # set up file names and paths
540
+ # versionType = 'v2_' # this should be '' when using VQA v2.0 dataset
541
+ # 'OpenEnded' only for v2.0. 'OpenEnded' or 'MultipleChoice' for v1.0
542
+ # taskType = 'OpenEnded'
543
+ # 'mscoco' only for v1.0. 'mscoco' for real and 'abstract_v002' for abstract for v1.0.
544
+ # dataType = 'mscoco'
545
+ # dataSubType = 'train2014'
546
+ # annFile = '%s/%s%s_%s_annotations.json' % (
547
+ # dataDir, versionType, dataType, dataSubType)
548
+ # quesFile = '%s/%s%s_%s_%s_questions.json' % (
549
+ # dataDir, versionType, taskType, dataType, dataSubType)
550
+ # imgDir = '%s/%s/%s/' % (dataDir, dataType, dataSubType)
551
+ # resultType = res_file_name
552
+ # fileTypes = ['results', 'accuracy',
553
+ # 'evalQA', 'evalQuesType', 'evalAnsType']
554
+
555
+ # An example result json file has been provided in './Results' folder.
556
+
557
+ # [resFile, accuracyFile, evalQAFile, evalQuesTypeFile, evalAnsTypeFile] = ['%s/%s%s_%s_%s_%s_%s.json' % (dataDir, versionType, taskType, dataType, dataSubType,
558
+ # resultType, fileType) for fileType in fileTypes]
559
+
560
+ # create vqa object and vqaRes object
561
+ vqa = VQA(annotation_json_path, question_json_path)
562
+ vqaRes = vqa.loadRes(result_json_path, question_json_path)
563
+
564
+ # create vqaEval object by taking vqa and vqaRes
565
+ # n is precision of accuracy (number of places after decimal), default is 2
566
+ vqaEval = VQAEval(vqa, vqaRes, n=2)
567
+
568
+ # evaluate results
569
+ """
570
+ If you have a list of question ids on which you would like to evaluate your results, pass it as a list to below function
571
+ By default it uses all the question ids in annotation file
572
+ """
573
+ vqaEval.evaluate()
574
+
575
+ return vqaEval.accuracy["overall"]
576
+
577
+
578
+ def postprocess_vqa_generation(predictions):
579
+ return re.split("Question|Answer", predictions, 1)[0]
580
+
581
+
582
+ def compute_gqa_accuracy(results):
583
+ acc = []
584
+ vqa_tool = VQAEval()
585
+
586
+ for res in results:
587
+ gt_ans = res["answers"]
588
+ pred = res["answer"]
589
+ pred = vqa_tool.processPunctuation(pred)
590
+ pred = vqa_tool.processDigitArticle(pred)
591
+ vqa_acc = 1 if pred == gt_ans else 0
592
+ acc.append(vqa_acc)
593
+ accuracy = sum(acc) / len(acc)
594
+ return accuracy
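Note: the scoring rule inside VQAEval.evaluate gives a prediction `min(1, matches / 3)` credit against each annotator, where `matches` counts how many of the *other* annotators gave the same answer, and then averages. A stripped-down sketch of just that consensus rule, omitting the punctuation and digit/article normalization that VQAEval applies first:

```python
def vqa_consensus_accuracy(pred, gt_answers):
    scores = []
    for i in range(len(gt_answers)):
        others = gt_answers[:i] + gt_answers[i + 1:]
        matches = sum(1 for a in others if a == pred)
        scores.append(min(1.0, matches / 3))
    return sum(scores) / len(scores)

answers = ["red"] * 8 + ["blue"] * 2
print(vqa_consensus_accuracy("red", answers))   # 1.0
print(vqa_consensus_accuracy("blue", answers))  # ~0.6
```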
multimodal/build/lib/open_flamingo/src/__init__.py ADDED
File without changes
multimodal/build/lib/open_flamingo/src/attention.py ADDED
@@ -0,0 +1,45 @@
1
+ import numpy as np
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import init
5
+
6
+
7
+
8
+ class SEAttention(nn.Module):
9
+
10
+ def __init__(self, channel=512,reduction=16):
11
+ super().__init__()
12
+ self.fc = nn.Sequential(
13
+ nn.Linear(channel, channel // reduction, bias=False),
14
+ nn.GELU(),
15
+ nn.Linear(channel // reduction, channel, bias=False),
16
+ nn.GELU(),
17
+ nn.Linear(channel, 1, bias=False),
18
+ nn.Sigmoid()
19
+ )
20
+
21
+
22
+ def init_weights(self):
23
+ for m in self.modules():
24
+ if isinstance(m, nn.Conv2d):
25
+ init.kaiming_normal_(m.weight, mode='fan_out')
26
+ if m.bias is not None:
27
+ init.constant_(m.bias, 0)
28
+ elif isinstance(m, nn.BatchNorm2d):
29
+ init.constant_(m.weight, 1)
30
+ init.constant_(m.bias, 0)
31
+ elif isinstance(m, nn.Linear):
32
+ init.normal_(m.weight, std=0.001)
33
+ if m.bias is not None:
34
+ init.constant_(m.bias, 0)
35
+
36
+ def forward(self, x):
37
+ x = self.fc(x)
38
+ return x
39
+
40
+
41
+ if __name__ == '__main__':
42
+ # nn.Linear acts on the last dimension, so the demo input must be channel-last: (batch, tokens, channel)
+ input = torch.randn(50, 49, 512)
43
+ se = SEAttention(channel=512, reduction=8)
44
+ output = se(input)
45
+ print(output.shape)  # torch.Size([50, 49, 1])
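Note: SEAttention here is just a per-token scalar gate: the Linear stack maps a channel-last feature to a single sigmoid score. How that score is consumed is not shown in this diff, so the weighting below is only an assumed usage; the layer stack itself mirrors the module above.

```python
import torch
from torch import nn

# Same layer stack as SEAttention.fc with channel=512, reduction=16.
fc = nn.Sequential(
    nn.Linear(512, 512 // 16, bias=False),
    nn.GELU(),
    nn.Linear(512 // 16, 512, bias=False),
    nn.GELU(),
    nn.Linear(512, 1, bias=False),
    nn.Sigmoid(),
)

tokens = torch.randn(2, 49, 512)  # (batch, tokens, channel)
gate = fc(tokens)                 # (2, 49, 1), values in (0, 1)
weighted = tokens * gate          # one possible (assumed) way to apply the gate
print(gate.shape, weighted.shape)
```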
multimodal/build/lib/open_flamingo/src/factory.py ADDED
@@ -0,0 +1,269 @@
1
+ from transformers import AutoModelForCausalLM, AutoTokenizer
2
+ import open_clip
3
+ import torch
4
+
5
+ from .flamingo import Flamingo
6
+ from .flamingo_lm import FlamingoLMMixin
7
+ from .utils import extend_instance
8
+ import logging
9
+ import random
10
+ import time
11
+
12
+ def create_model_and_transforms(
13
+ clip_vision_encoder_path: str,
14
+ clip_vision_encoder_pretrained: str,
15
+ lang_encoder_path: str,
16
+ tokenizer_path: str,
17
+ use_local_files: bool = False,
18
+ decoder_layers_attr_name: str = None,
19
+ location_token_num: int = 1000,
20
+ checkpoint_activations: bool = False,
21
+ freeze_vision_encoder: bool = False,
22
+ lora: bool = False,
23
+ lora_r: int = 16,
24
+ fix_ffn: bool = False,
25
+ add_visual_token: bool = False,
26
+ add_box: bool = False,
27
+ add_pe: bool = False,
28
+ add_relation: bool = False,
29
+ use_format_v2: bool = False,
30
+ use_sam: str = None,
31
+ enhance_data: bool = False,
32
+ roi_align: bool = False,
33
+ roi_output_size: int = 4,
34
+ apply_mask: bool = False,
35
+ **flamingo_kwargs,
36
+ ):
37
+ """
38
+ Initialize a Flamingo model from a pretrained vision encoder and language encoder.
39
+ Appends special tokens to the tokenizer and freezes backbones.
40
+
41
+ Args:
42
+ clip_vision_encoder_path (str): path to pretrained clip model (e.g. "ViT-B-32")
43
+ clip_vision_encoder_pretrained (str): name of pretraining dataset for clip model (e.g. "laion2b_s32b_b79k")
44
+ lang_encoder_path (str): path to pretrained language encoder
45
+ tokenizer_path (str): path to pretrained tokenizer
46
+ cross_attn_every_n_layers (int, optional): determines how often to add a cross-attention layer. Defaults to 1.
47
+ use_local_files (bool, optional): whether to use local files. Defaults to False.
48
+ decoder_layers_attr_name (str, optional): name of the decoder layers attribute. Defaults to None.
49
+ Returns:
50
+ Flamingo: Flamingo model from pretrained vision and language encoders
51
+ Image processor: Pipeline to preprocess input images
52
+ Tokenizer: A tokenizer for the language model
53
+ """
54
+ if use_sam is None:
55
+ no_success = True
56
+ while no_success:
57
+ try:
58
+ vision_encoder, _, image_processor = open_clip.create_model_and_transforms(
59
+ clip_vision_encoder_path, pretrained=clip_vision_encoder_pretrained
60
+ )
61
+ no_success = False
62
+ except Exception:  # keep retrying on transient errors, but let KeyboardInterrupt through
63
+ logging.info("retry creating vision_encoder")
64
+ time.sleep(random.random() * 5)
65
+
66
+ # set the vision encoder to output the visual features
67
+ vision_encoder.visual.output_tokens = True
68
+ # delete text encoder part
69
+ del vision_encoder.transformer
70
+ del vision_encoder.text_projection
71
+ del vision_encoder.token_embedding
72
+ del vision_encoder.ln_final
73
+ del vision_encoder.positional_embedding
74
+ del vision_encoder.logit_scale
75
+ vision_encoder.visual.proj = None
76
+ vision_encoder.visual.ln_post = torch.nn.Identity()
77
+ else:
78
+ from segment_anything import SamPredictor, sam_model_registry
79
+ assert use_sam == "vit_l"
80
+ sam = sam_model_registry[use_sam](checkpoint="/gpfs/u/home/LMCG/LMCGljnn/scratch/code/checkpoint/sam_vit_l_0b3195_256x256.pth")
81
+ del sam.prompt_encoder
82
+ del sam.mask_decoder
83
+ sam.image_encoder.neck = torch.nn.Identity()
84
+ vision_encoder = sam.image_encoder
85
+ from open_clip.transform import image_transform
86
+ image_processor = image_transform(
87
+ 256,
88
+ is_train=False,
89
+ mean=(0.48145466, 0.4578275, 0.40821073),
90
+ std=(0.26862954, 0.26130258, 0.27577711),
91
+ )
92
+
93
+ text_tokenizer = AutoTokenizer.from_pretrained(
94
+ tokenizer_path, local_files_only=use_local_files
95
+ )
96
+ # add Flamingo special tokens to the tokenizer
97
+ additional_special_tokens = ["<|#image#|>", "<|#endofimage#|>"]
98
+ if add_visual_token:
99
+ additional_special_tokens += ["<|#visual#|>", "<|#object#|>"]
100
+ if add_box:
101
+ additional_special_tokens += ["<|#box#|>", "<|#endofobject#|>", "<|#attr#|>", "<|#endofattr#|>"]
102
+ if use_format_v2:
103
+ additional_special_tokens += ["<|#previsual#|>", "<|#prebox#|>"]
104
+ if enhance_data:
105
+ additional_special_tokens += ["<|#NOTHING#|>"]
106
+ text_tokenizer.add_special_tokens(
107
+ {"additional_special_tokens": additional_special_tokens}
108
+ )
109
+ if text_tokenizer.pad_token is None:
110
+ # Issue: GPT models don't have a pad token, which we use to
111
+ # modify labels for the loss.
112
+ text_tokenizer.add_special_tokens({"pad_token": "<PAD>"})
113
+
114
+ lang_encoder = AutoModelForCausalLM.from_pretrained(
115
+ lang_encoder_path, local_files_only=use_local_files
116
+ )
117
+ extend_instance(lang_encoder, FlamingoLMMixin)
118
+
119
+ if decoder_layers_attr_name is None:
120
+ decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder)
121
+ lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name)
122
+ lang_encoder.resize_token_embeddings(len(text_tokenizer))
123
+ lang_encoder_name = lang_encoder.__class__.__name__.lower()
124
+ if checkpoint_activations:
125
+ from fairscale.nn.checkpoint import checkpoint_wrapper
126
+ if use_sam is None:
127
+ for i in range(len(vision_encoder.visual.transformer.resblocks)):
128
+ vision_encoder.visual.transformer.resblocks[i] = checkpoint_wrapper(
129
+ vision_encoder.visual.transformer.resblocks[i],
130
+ offload_to_cpu=False,
131
+ )
132
+ else:
133
+ for i in range(len(vision_encoder.blocks)):
134
+ vision_encoder.blocks[i] = checkpoint_wrapper(
135
+ vision_encoder.blocks[i],
136
+ offload_to_cpu=False,
137
+ )
138
+ if "opt" in lang_encoder_name:
139
+ for i in range(len(lang_encoder.model.decoder.layers)):
140
+ lang_encoder.model.decoder.layers[i] = checkpoint_wrapper(
141
+ lang_encoder.model.decoder.layers[i],
142
+ offload_to_cpu=False,
143
+ )
144
+ elif "codegen" in lang_encoder_name:
145
+ for i in range(len(lang_encoder.transformer.h)):
146
+ lang_encoder.transformer.h[i] = checkpoint_wrapper(
147
+ lang_encoder.transformer.h[i],
148
+ offload_to_cpu=False,
149
+ )
150
+ elif "llama" in lang_encoder_name:
151
+ for i in range(len(lang_encoder.model.layers)):
152
+ lang_encoder.model.layers[i] = checkpoint_wrapper(
153
+ lang_encoder.model.layers[i],
154
+ offload_to_cpu=False,
155
+ )
156
+         elif "gptneo" in lang_encoder_name:  # matches GPT-NeoX-style class names; their blocks live under gpt_neox.layers
157
+ for i in range(len(lang_encoder.gpt_neox.layers)):
158
+ lang_encoder.gpt_neox.layers[i] = checkpoint_wrapper(
159
+ lang_encoder.gpt_neox.layers[i],
160
+ offload_to_cpu=False,
161
+ )
162
+ else:
163
+ raise ValueError(f"unknown model {lang_encoder_name}")
164
+ if use_sam is None:
165
+ vis_dim = open_clip.get_model_config(clip_vision_encoder_path)["vision_cfg"]["width"]
166
+ image_size = open_clip.get_model_config(clip_vision_encoder_path)["vision_cfg"]["image_size"]
167
+ patch_size = open_clip.get_model_config(clip_vision_encoder_path)["vision_cfg"]["patch_size"]
168
+ else:
169
+ # SAM config
170
+ vis_dim = 1024
171
+ image_size = 256
172
+ patch_size = 16
173
+ assert image_size % patch_size == 0
174
+ vis_embed_size = (image_size // patch_size) ** 2
175
+
176
+ if lora:
177
+ from peft import LoraConfig, TaskType
178
+ from peft import get_peft_model
179
+ if "codegen" in lang_encoder_name:
180
+ lang_target_modules = ["qkv_proj", "out_proj", "fc_in", "fc_out"]
181
+ elif "opt" in lang_encoder_name:
182
+ lang_target_modules = ["k_proj", "v_proj", "q_proj", "out_proj"]
183
+ elif "llama" in lang_encoder_name:
184
+ lang_target_modules = ["k_proj", "v_proj", "q_proj", "o_proj", "gate_proj", "down_proj", "up_proj"]
185
+ else:
186
+ raise NotImplementedError
187
+ lang_peft_config = LoraConfig(
188
+ task_type="CAUSAL_LM",
189
+ r=16, lora_alpha=16,
190
+ target_modules=lang_target_modules,
191
+ lora_dropout=0.05, bias="none",
192
+ )
193
+ lang_encoder = get_peft_model(lang_encoder, lang_peft_config)
194
+ lang_encoder.print_trainable_parameters()
195
+
196
+ if fix_ffn:
197
+ if "opt" in lang_encoder_name:
198
+ for i in range(len(lang_encoder.model.decoder.layers)):
199
+ lang_encoder.model.decoder.layers[i].requires_grad_(False)
200
+ lang_encoder.model.decoder.layers[i].self_attn.requires_grad_(True)
201
+ else:
202
+ raise NotImplementedError
203
+
204
+ lang_dim = int(lang_encoder.config.hidden_size) if not lora else int(lang_encoder.base_model.model.config.hidden_size)
205
+ if hasattr(lang_encoder.config, "word_embed_proj_dim"):
206
+ hidden_state_dim = lang_encoder.config.word_embed_proj_dim
207
+ else:
208
+ hidden_state_dim = lang_encoder.config.hidden_size
209
+ model = Flamingo(
210
+ vision_encoder=vision_encoder,
211
+ lang_encoder=lang_encoder,
212
+ eoc_token_id=text_tokenizer.encode(text_tokenizer.eos_token)[-1],
213
+ media_token_id=text_tokenizer.encode("<|#image#|>")[-1],
214
+ image_end_token_id=text_tokenizer.encode("<|#endofimage#|>")[-1],
215
+ visual_token_id=text_tokenizer.encode("<|#visual#|>")[-1] if add_visual_token else None,
216
+ previsual_token_id=text_tokenizer.encode("<|#previsual#|>")[-1] if add_visual_token else None,
217
+ box_token_id=text_tokenizer.encode("<|#box#|>")[-1] if add_box else None,
218
+ prebox_token_id=text_tokenizer.encode("<|#prebox#|>")[-1] if add_box else None,
219
+ nothing_token_id=text_tokenizer.encode("<|#NOTHING#|>")[-1] if enhance_data else None,
220
+ endofobject_token_id=text_tokenizer.encode("<|#endofobject#|>")[-1],
221
+ vis_dim=vis_dim,
222
+ vis_embed_size=vis_embed_size,
223
+ lang_dim=lang_dim,
224
+ image_size=image_size,
225
+ patch_size=patch_size,
226
+ hidden_state_dim=hidden_state_dim,
227
+ add_visual_token=add_visual_token,
228
+ add_pe=add_pe,
229
+ add_relation=add_relation,
230
+ use_format_v2=use_format_v2,
231
+ roi_align=roi_align,
232
+ roi_output_size=roi_output_size,
233
+ apply_mask=apply_mask,
234
+ **flamingo_kwargs,
235
+ )
236
+
237
+ if freeze_vision_encoder:
238
+ print("freeze vision encoder")
239
+ model.vision_encoder.requires_grad_(False)
240
+
241
+ print(
242
+ f"Flamingo model initialized with {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable parameters"
243
+ )
244
+
245
+ return model, image_processor, text_tokenizer, vis_embed_size
246
+
247
+
248
+ def _infer_decoder_layers_attr_name(model):
249
+ for k in __KNOWN_DECODER_LAYERS_ATTR_NAMES:
250
+ if k.lower() in model.__class__.__name__.lower():
251
+ return __KNOWN_DECODER_LAYERS_ATTR_NAMES[k]
252
+
253
+ raise ValueError(
254
+         "Could not infer the decoder layers attribute name for this model. Please supply decoder_layers_attr_name (the dotted path to the nn.ModuleList of transformer blocks) manually."
255
+ )
256
+
257
+
258
+ __KNOWN_DECODER_LAYERS_ATTR_NAMES = {
259
+ "opt": "model.decoder.layers",
260
+ # "gptneo": "transformer.h",
261
+ "gptj": "transformer.h",
262
+ "gpt-j": "transformer.h",
263
+ "pythia": "gpt_neox.layers",
264
+ "gptneox": "gpt_neox.layers",
265
+ "llama": "model.layers",
266
+ "llamaforcausallm": "model.layers",
267
+ "gpt2": "transformer.h",
268
+ "codegen": "transformer.h",
269
+ }
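
The factory resolves where a given language model keeps its transformer blocks by substring-matching the class name against the table above; the returned dotted path is later walked with `getattr_recursive` from `src/utils.py`. A minimal, self-contained sketch of that lookup (re-implementing the logic rather than importing this package; the stub class is hypothetical):

```python
# Minimal sketch of the class-name -> decoder-layers-path lookup used in factory.py.
# The table is a subset of __KNOWN_DECODER_LAYERS_ATTR_NAMES; OPTForCausalLMStub is a stand-in.
KNOWN_DECODER_LAYERS_ATTR_NAMES = {
    "opt": "model.decoder.layers",
    "gptneox": "gpt_neox.layers",
    "llama": "model.layers",
}

def infer_decoder_layers_attr_name(model) -> str:
    name = model.__class__.__name__.lower()
    for key, attr_path in KNOWN_DECODER_LAYERS_ATTR_NAMES.items():
        if key in name:
            return attr_path
    raise ValueError("Please supply decoder_layers_attr_name manually.")

class OPTForCausalLMStub:
    pass

print(infer_decoder_layers_attr_name(OPTForCausalLMStub()))  # -> "model.decoder.layers"
```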
multimodal/build/lib/open_flamingo/src/flamingo.py ADDED
@@ -0,0 +1,637 @@
1
+ import torch
2
+ import torchvision
3
+ from einops import rearrange
4
+ from torch import nn
5
+ from yolox.models.yolo_head import YOLOXHead
6
+ from yolox.utils.boxes import xyxy2cxcywh, cxcywh2xyxy
7
+ from yolox.utils.demo_utils import nms
8
+ # import matplotlib.pyplot as plt
9
+ # import seaborn as sns
10
+ import numpy as np
11
+ import logging
12
+ from open_flamingo.src.gcn import GCN
13
+ from transformers import LogitsProcessorList
14
+ logging.basicConfig(
15
+ level=logging.INFO,
16
+ format='%(asctime)s %(message)s',
17
+ datefmt='%m/%d %I:%M:%S',
18
+ )
19
+
20
+
21
+ # class PositionEncodingModule(nn.Module):
22
+ # def __init__(self, dim, pos_dim=128):
23
+ # super().__init__()
24
+ # self.encode = nn.Sequential(
25
+ # nn.Linear(5, pos_dim // 2),
26
+ # nn.BatchNorm1d(pos_dim // 2),
27
+ # nn.GELU(),
28
+ # nn.Linear(pos_dim // 2, pos_dim),
29
+ # nn.BatchNorm1d(pos_dim),
30
+ # nn.GELU(),
31
+ # )
32
+ # self.merge = nn.Sequential(
33
+ # nn.Linear(dim + pos_dim, dim),
34
+ # nn.BatchNorm1d(dim),
35
+ # nn.GELU(),
36
+ # )
37
+
38
+ # def forward(self, x, box):
39
+ # box = self.encode(box)
40
+ # x = torch.cat([x, box], dim=-1)
41
+ # x = self.merge(x)
42
+ # return x
43
+
44
+
45
+ # class PositionEncodingModule(nn.Module):
46
+ # def __init__(self, dim):
47
+ # super().__init__()
48
+ # self.encode = nn.Sequential(
49
+ # nn.Linear(5, dim),
50
+ # nn.GELU(),
51
+ # )
52
+
53
+ # def forward(self, x, box):
54
+ # box = self.encode(box)
55
+ # x = x + box
56
+ # return x
57
+
58
+
59
+ # class PositionEncodingModule2(nn.Module):
60
+ # def __init__(self, dim):
61
+ # super().__init__()
62
+ # self.encode = nn.Sequential(
63
+ # nn.Linear(5 + dim, dim),
64
+ # nn.ELU(),
65
+ # )
66
+
67
+ # def forward(self, x, box):
68
+ # x = torch.cat([x, box], dim=-1)
69
+ # x = self.encode(x)
70
+ # return x
71
+
72
+
73
+ # class RelationHead(nn.Module):
74
+ # def __init__(self, dim):
75
+ # super().__init__()
76
+ # self.encode = nn.Sequential(
77
+ # nn.LayerNorm(dim),
78
+ # nn.Linear(dim, 128),
79
+ # nn.ELU(),
80
+ # )
81
+ # self.classifier = nn.Linear(256, 51)
82
+
83
+ # def forward(self, x1, x2):
84
+ # x1 = self.encode(x1)
85
+ # x2 = self.encode(x2)
86
+ # x = torch.cat([x1, x2], dim=-1)
87
+ # x = self.classifier(x)
88
+ # return x
89
+
90
+
91
+ class Flamingo(nn.Module):
92
+ def __init__(
93
+ self,
94
+ vision_encoder: nn.Module,
95
+ lang_encoder: nn.Module,
96
+ eoc_token_id: int,
97
+ media_token_id: int,
98
+ image_end_token_id: int,
99
+ visual_token_id: int,
100
+ previsual_token_id: int,
101
+ box_token_id: int,
102
+ prebox_token_id: int,
103
+ nothing_token_id: int,
104
+ endofobject_token_id: int,
105
+ vis_dim: int,
106
+ vis_embed_size: int,
107
+ lang_dim: int,
108
+ hidden_state_dim: int,
109
+ image_size: int,
110
+ patch_size: int,
111
+ use_media_placement_augmentation: bool = False,
112
+ add_visual_token: bool = False,
113
+ add_pe: bool = False,
114
+ add_relation: bool = False,
115
+ use_format_v2: bool = False,
116
+ roi_align: bool = False,
117
+ roi_output_size: int = 4,
118
+ apply_mask: bool = False,
119
+ ):
120
+ """
121
+ Args:
122
+ vision_encoder (nn.Module): HF CLIPModel
123
+ lang_encoder (nn.Module): HF causal language model
124
+ eoc_token_id (int): Token id for eos token
125
+ media_token_id (int): Token id for <|#image#|>
126
+ vis_dim (int): Dimension of the visual features.
127
+ Visual features are projected to match this shape along the last dimension.
128
+ cross_attn_every_n_layers (int, optional): How often to apply cross attention after transformer layer. Defaults to 1.
129
+ use_media_placement_augmentation (bool, optional): Whether to randomly assign images to the preceding or following text in training. Defaults to False.
130
+ """
131
+ super().__init__()
132
+ self.image_end_token_id = image_end_token_id
133
+ self.eoc_token_id = eoc_token_id
134
+ self.media_token_id = media_token_id
135
+ self.use_media_placement_augmentation = use_media_placement_augmentation
136
+ self.vis_dim = vis_dim
137
+ self.lang_dim = lang_dim
138
+ # inner_dim = self.lang_dim * 4
139
+ # self.vis_proj = nn.Sequential(
140
+ # nn.LayerNorm(self.vis_dim),
141
+ # nn.Linear(self.vis_dim, inner_dim, bias=False),
142
+ # nn.GELU(),
143
+ # nn.Linear(inner_dim, self.lang_dim, bias=False),
144
+ # )
145
+ self.vis_proj = nn.Linear(self.vis_dim, self.lang_dim)
146
+ self.vision_encoder = vision_encoder
147
+ self.num_positions = vis_embed_size
148
+ self.lang_encoder = lang_encoder
149
+ self.lang_encoder.init_flamingo(
150
+ media_token_id=media_token_id,
151
+ use_media_placement_augmentation=self.use_media_placement_augmentation,
152
+ )
153
+ first_layer = self.lang_encoder._get_decoder_layers()[0]
154
+ first_layer.add_visual_token = add_visual_token
155
+ first_layer.visual_token_id = visual_token_id
156
+ first_layer.media_token_id = media_token_id
157
+ first_layer.box_token_id = box_token_id
158
+ # first_layer.pos_enc = PositionEncodingModule(self.lang_dim) if add_pe else None
159
+ # assert not (add_pe and add_relation)
160
+ # self.pos_enc = PositionEncodingModule(self.lang_dim) if add_pe else None
161
+ # first_layer.pos_enc = self.pos_enc
162
+ self.box_token_id = box_token_id
163
+ self.prebox_token_id = prebox_token_id
164
+ self.media_token_id = media_token_id
165
+ self.visual_token_id = visual_token_id
166
+ self.previsual_token_id = previsual_token_id
167
+ self.hidden_state_dim = hidden_state_dim
168
+ self.image_size = image_size
169
+ self.patch_size = patch_size
170
+ self.patch_num = self.image_size // self.patch_size
171
+ self.detection_head = YOLOXHead(
172
+ num_classes=1,
173
+ strides=[patch_size],
174
+ in_channels=[self.hidden_state_dim + self.lang_dim],
175
+ )
176
+ self.use_format_v2 = use_format_v2
177
+ self.nothing_token_id = nothing_token_id
178
+ self.roi_align = roi_align
179
+ self.roi_output_size = roi_output_size if roi_align else None
180
+ self.apply_mask = apply_mask
181
+ self.endofobject_token_id = endofobject_token_id
182
+
183
+
184
+ def _get_detection_batch(
185
+ self,
186
+ visual_token_id,
187
+ previsual_token_id,
188
+ input_ids: torch.Tensor,
189
+ hidden_states: torch.Tensor,
190
+ added_bbox_list,
191
+ box_num = 100,
192
+ ):
193
+ select_mask = torch.logical_or(input_ids == visual_token_id, input_ids == previsual_token_id)
194
+ visual_token_position = select_mask.nonzero()
195
+ visual_token_hidden_states = hidden_states[select_mask]
196
+ prev_batch_idx = -1
197
+ media_idx = []
198
+ cnt = 0
199
+ assert len(visual_token_hidden_states) == len(visual_token_position)
200
+ if len(added_bbox_list) != len(visual_token_position):
201
+ msg = f"ERROR: {len(added_bbox_list)}:{len(visual_token_position)}\n{added_bbox_list}\n{visual_token_position}"
202
+ logging.info(msg)
203
+ alpha = 0.0
204
+ else:
205
+ alpha = 1.0
206
+ visual_batches = []
207
+ previsual_batches = []
208
+ for (batch_idx, idx), visual_token_hidden_state, bbox in zip(
209
+ visual_token_position, visual_token_hidden_states, added_bbox_list,
210
+ ):
211
+             # clone first: xyxy2cxcywh below modifies its argument in place, so we must not
+             # mutate the caller's bbox tensor
+             bbox = bbox.clone()
214
+ batch_idx = batch_idx.item()
215
+ idx = idx.item()
216
+ if batch_idx != prev_batch_idx:
217
+ prev_batch_idx = batch_idx
218
+ this_input_ids = input_ids[batch_idx]
219
+ cnt += len(media_idx)
220
+ media_idx = (this_input_ids == self.media_token_id).nonzero().reshape(-1).tolist()
221
+ for i in range(len(media_idx)):
222
+                 if i == len(media_idx) - 1 or (media_idx[i] < idx < media_idx[i+1]):
223
+ break
224
+ image_index = cnt + i
225
+ size = int(self.image_embedding[image_index].shape[0] ** 0.5)
226
+ image_embedding = self.image_embedding[image_index]
227
+ # inplace xyxy2cxcywh
228
+ # print(bbox)
229
+ # TODO: CHECK self.image_size. Is it 224?
230
+ bbox = xyxy2cxcywh(bbox) * self.image_size
231
+ # print(bbox)
232
+ concat_image_visual_embedding = torch.cat([image_embedding, visual_token_hidden_state.unsqueeze(0).repeat(image_embedding.shape[0], 1)], dim=-1).reshape(size, size, -1)
233
+ label = torch.cat([torch.zeros(bbox.shape[0], 1, device=bbox.device), bbox], dim=-1)
234
+ label = torch.cat([label, torch.zeros(box_num - label.shape[0], label.shape[1], device=label.device)], dim=0)
235
+ if input_ids[batch_idx, idx] == previsual_token_id:
236
+ previsual_batches.append([concat_image_visual_embedding, label])
237
+ elif input_ids[batch_idx, idx] == visual_token_id:
238
+ visual_batches.append([concat_image_visual_embedding, label])
239
+ else:
240
+ logging.info(f"WARNING... NOT visual nor previsual. it is {input_ids[batch_idx, idx]}")
241
+ return visual_batches, previsual_batches, alpha, alpha
242
+
243
+ def get_detection_losses(
244
+ self,
245
+ input_ids: torch.Tensor,
246
+ hidden_states: torch.Tensor,
247
+ added_bbox_list,
248
+ box_num = 100,
249
+ ):
250
+ visual_token_batches, previsual_token_batches, alpha1, alpha2 = self._get_detection_batch(
251
+ visual_token_id=self.visual_token_id,
252
+ previsual_token_id=self.previsual_token_id,
253
+ input_ids=input_ids,
254
+ hidden_states=hidden_states,
255
+ added_bbox_list=added_bbox_list,
256
+ box_num=box_num,
257
+ )
258
+ loss_dict = []
259
+ for batches, alpha in zip([visual_token_batches, previsual_token_batches], [alpha1, alpha2]):
260
+ # x: [B, C, H, W]
261
+ if len(batches) != 0:
262
+ x = torch.cat([batch[0].unsqueeze(0) for batch in batches], dim=0).permute(0,3,1,2)
263
+ labels = torch.cat([batch[1].unsqueeze(0) for batch in batches], dim=0)
264
+ else:
265
+ x = None
266
+ labels = None
267
+ if x is not None:
268
+ losses = self.detection_head(xin=[x], labels=labels)
269
+ loss, loss_iou, loss_obj, loss_cls, loss_l1, _ = losses
270
+ else:
271
+ loss = torch.tensor(0.0).cuda()
272
+ loss_iou = loss
273
+ loss_obj = loss
274
+ loss_cls = loss
275
+ loss_l1 = loss
276
+
277
+ loss_dict.append(dict(
278
+ loss=loss * alpha,
279
+ loss_iou=loss_iou * alpha,
280
+ loss_obj=loss_obj * alpha,
281
+ loss_cls=loss_cls * alpha,
282
+ loss_l1=loss_l1 * alpha,
283
+ ))
284
+ ret_loss = {}
285
+ for key in loss_dict[0].keys():
286
+ ret_loss[key] = 0.0
287
+ for d in loss_dict:
288
+ ret_loss[key] += d[key]
289
+ return ret_loss, loss_dict
290
+
291
+ def get_detection_result(
292
+ self,
293
+ input_ids: torch.Tensor,
294
+ hidden_states: torch.Tensor,
295
+ nms_thr: float = 0.45,
296
+ score_thr: float = 0.01,
297
+ debug_id: int = 0,
298
+ debug_mode: bool = False,
299
+ ):
300
+         assert len(input_ids) == 1, "only batch size 1 is supported for now"
301
+ # assert len(self.image_embedding) == 1, "only one image is supported yet"
302
+ # assert (input_ids[..., -1] == self.visual_token_id).all(), "the last token should be visual token"
303
+ visual_token_hidden_state = hidden_states[..., -1, :]
304
+ boxes_list = []
305
+ scores_list = []
306
+ for image_embedding in self.image_embedding:
307
+ size = int(image_embedding.shape[0] ** 0.5)
308
+ x = torch.cat([image_embedding, visual_token_hidden_state.repeat(image_embedding.shape[0], 1)], dim=-1).reshape(size, size, -1).unsqueeze(0).permute(0,3,1,2)
309
+ with torch.no_grad():
310
+ outputs = self.detection_head(xin=[x], labels=None)
311
+ boxes = outputs[0,:,:4].cpu().numpy()
312
+ scores = outputs[0,:,4].cpu().numpy()
313
+ scores_mask = scores > score_thr
314
+ boxes = boxes[scores_mask]
315
+ boxes = cxcywh2xyxy(boxes)
316
+ scores = scores[scores_mask]
317
+ keep = nms(boxes, scores, nms_thr=nms_thr)
318
+ boxes = boxes[keep]
319
+ scores = scores[keep]
320
+ if debug_mode:
321
+ obj_heatmap = outputs[0,:, -2].reshape(size, size).cpu().numpy()
322
+ import matplotlib.pyplot as plt
323
+ import seaborn as sns
324
+ plt.figure()
325
+ sns_plot = sns.heatmap(obj_heatmap)
326
+ plt.savefig(f"heatmap_{debug_id}.jpg")
327
+ debug_id += 1
328
+ boxes_list.append(boxes)
329
+ scores_list.append(scores)
330
+ if len(boxes_list) == 1:
331
+ boxes_list = boxes_list[0]
332
+ scores_list = scores_list[0]
333
+ return boxes_list, scores_list
334
+
335
+ def _condition_attention(self, loc_list = None):
336
+ for i in range(len(self.lang_encoder.gpt_neox.layers)):
337
+ self.lang_encoder.gpt_neox.layers[i].decoder_layer.attention.loc_list = loc_list
338
+
339
+ def forward(
340
+ self,
341
+ vision_x: torch.Tensor,
342
+ lang_x: torch.Tensor,
343
+ attention_mask: torch.Tensor = None,
344
+ labels: torch.Tensor = None,
345
+ use_cached_vision_x: bool = False,
346
+ clear_conditioned_layers: bool = True,
347
+ past_key_values=None,
348
+ use_cache: bool = False,
349
+ image_nums=None,
350
+ image_start_index_list=None,
351
+ added_bbox_list=None,
352
+ add_box: bool = False,
353
+ relations=None,
354
+ debug_mode: bool = False,
355
+ ):
356
+ """
357
+ Forward pass of Flamingo.
358
+
359
+ Args:
360
+ vision_x (torch.Tensor): Vision input
361
+ shape (B, T_img, F, C, H, W) with F=1
362
+ lang_x (torch.Tensor): Language input ids
363
+ shape (B, T_txt)
364
+ attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
365
+ labels (torch.Tensor, optional): Labels. Defaults to None.
366
+ clear_conditioned_layers: if True, clear the conditioned layers
367
+                 once the forward pass is completed. Set this to False if the
368
+ same set of images will be reused in another subsequent
369
+ forward pass.
370
+ past_key_values: pre-computed values to pass to language model.
371
+ See past_key_values documentation in Hugging Face
372
+ CausalLM models.
373
+ use_cache: whether to use cached key values. See use_cache
374
+ documentation in Hugging Face CausalLM models.
375
+ """
376
+ self.valid = True
377
+ self.lang_encoder.loc_list = None
378
+ if use_cached_vision_x:
379
+ # Case: use cached; vision_x should be cached and other
380
+ # vision-related inputs should not be provided.
381
+ assert (
382
+ vision_x is None
383
+ ), "Expect vision_x to be None when use_cached_vision_x is True."
384
+ assert self.lang_encoder.is_conditioned()
385
+ else:
386
+ # Case: do not use caching (i.e. this is a standard forward pass);
387
+ self._encode_vision_x(
388
+ vision_x=vision_x,
389
+ image_nums=image_nums,
390
+ image_start_index_list=image_start_index_list,
391
+ added_bbox_list=added_bbox_list if add_box else None,
392
+ input_ids=lang_x,
393
+ relations=relations,
394
+ )
395
+ if self.apply_mask:
396
+ if self.roi_align:
397
+ attend_length = 1 + self.roi_output_size ** 2
398
+ else:
399
+ attend_length = 2
400
+ prebox_loc = (lang_x == self.prebox_token_id).nonzero()
401
+ loc_list = []
402
+ for (x, y) in prebox_loc:
403
+ x = x.item()
404
+ y = y.item()
405
+ for yy in range(y+1, lang_x.shape[1]):
406
+ if lang_x[x, yy] == self.endofobject_token_id:
407
+ # [batch_idx, [previsual:prebox], [object:endofobject-1]]
408
+ loc_list.append([x, [y-attend_length+1, y], [y+1, yy-1]])
409
+ self._condition_attention(loc_list=loc_list)
410
+ else:
411
+ self._condition_attention(None)
412
+
413
+ output = self.lang_encoder(
414
+ input_ids=lang_x,
415
+ attention_mask=attention_mask,
416
+ labels=labels,
417
+ past_key_values=past_key_values,
418
+ use_cache=use_cache,
419
+ output_hidden_states=True,
420
+ )
421
+         if vision_x is None:
+             # text-only step: run a zero-weighted dummy forward through the vision tower so its
+             # parameters stay in the autograd graph (avoids unused-parameter errors under DDP)
+             output['loss'][0] += 0.0 * self.vis_proj(self.vision_encoder.visual(torch.randn(1, 3, 224, 224, device=lang_x.device, dtype=output['loss'].dtype))[1]).mean()
423
+
424
+ hidden_states = output["hidden_states"][-1]
425
+ if self.training and added_bbox_list is not None:
426
+ detection_losses, loss_dict = self.get_detection_losses(
427
+ input_ids=lang_x,
428
+ hidden_states=hidden_states,
429
+ added_bbox_list=added_bbox_list,
430
+ )
431
+ output["detection_losses"] = detection_losses
432
+ output["loss_dict"] = loss_dict
433
+ elif labels is None:
434
+ boxes, scores = self.get_detection_result(
435
+ input_ids=lang_x,
436
+ hidden_states=hidden_states,
437
+ debug_id=self.debug_id if hasattr(self, "debug_id") else None,
438
+ debug_mode=debug_mode,
439
+ )
440
+ output["boxes"] = boxes
441
+ output["scores"] = scores
442
+
443
+ if clear_conditioned_layers:
444
+ self.lang_encoder.clear_conditioned_layers()
445
+ self._condition_attention(None)
446
+ return output
447
+
448
+ def generate(
449
+ self,
450
+ vision_x: torch.Tensor,
451
+ lang_x: torch.Tensor,
452
+ attention_mask: torch.Tensor = None,
453
+ added_bbox_list=None,
454
+ num_beams=1,
455
+ max_new_tokens=None,
456
+ temperature=1.0,
457
+ top_k=0,
458
+ top_p=1.0,
459
+ no_repeat_ngram_size=0,
460
+ prefix_allowed_tokens_fn=None,
461
+ length_penalty=1.0,
462
+ num_return_sequences=1,
463
+ do_sample=False,
464
+ early_stopping=False,
465
+ bad_words_ids=None,
466
+ force_words_ids=None,
467
+ image_start_index_list=None,
468
+ image_nums=None,
469
+ min_length=None,
470
+ return_dict_in_generate=False,
471
+ output_hidden_states=False,
472
+ output_scores=False,
473
+ logits_processor_list=None,
474
+ eos_token_id=None,
475
+ ):
476
+ """
477
+ Generate text conditioned on vision and language inputs.
478
+
479
+ Args:
480
+ vision_x (torch.Tensor): Vision input
481
+ shape (B, T_img, F, C, H, W)
482
+ images in the same chunk are collated along T_img, and frames are collated along F
483
+ currently only F=1 is supported (single-frame videos)
484
+ lang_x (torch.Tensor): Language input
485
+ shape (B, T_txt)
486
+ max_length (int, optional): Maximum length of the output. Defaults to None.
487
+ attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
488
+ num_beams (int, optional): Number of beams. Defaults to 1.
489
+ max_new_tokens (int, optional): Maximum new tokens. Defaults to None.
490
+ temperature (float, optional): Temperature. Defaults to 1.0.
491
+ top_k (int, optional): Top k. Defaults to 0.
492
+ top_p (float, optional): Top p. Defaults to 1.0.
493
+ no_repeat_ngram_size (int, optional): No repeat ngram size. Defaults to 0.
494
+ length_penalty (float, optional): Length penalty. Defaults to 1.0.
495
+ num_return_sequences (int, optional): Number of return sequences. Defaults to 1.
496
+ do_sample (bool, optional): Do sample. Defaults to False.
497
+ early_stopping (bool, optional): Early stopping. Defaults to False.
498
+ Returns:
499
+ torch.Tensor: lang_x with generated tokens appended to it
500
+ """
501
+ if num_beams > 1:
502
+ vision_x = vision_x.repeat_interleave(num_beams, dim=0)
503
+ image_start_index_list = torch.tensor(image_start_index_list).repeat_interleave(num_beams, dim=0).tolist()
504
+ image_nums = torch.tensor(image_nums).repeat_interleave(num_beams, dim=0).tolist()
505
+ if added_bbox_list is not None and len(added_bbox_list) != 0:
506
+ added_bbox_list = added_bbox_list * num_beams
507
+
508
+ self._encode_vision_x(vision_x=vision_x, image_nums=image_nums, image_start_index_list=image_start_index_list, num_beams=num_beams, added_bbox_list=added_bbox_list, input_ids=lang_x.repeat_interleave(num_beams, dim=0))
509
+
510
+ if logits_processor_list is not None:
511
+ assert isinstance(logits_processor_list, list)
512
+ logits_processor_list = LogitsProcessorList(logits_processor_list)
513
+ output = self.lang_encoder.generate(
514
+ input_ids=lang_x,
515
+ attention_mask=attention_mask,
516
+ eos_token_id=(self.eoc_token_id) if eos_token_id is None else eos_token_id,
517
+ num_beams=num_beams,
518
+ max_new_tokens=max_new_tokens,
519
+ min_length=min_length,
520
+ length_penalty=length_penalty,
521
+ logits_processor=logits_processor_list,
522
+ return_dict_in_generate=return_dict_in_generate,
523
+ output_scores=output_scores,
524
+ )
525
+ self.lang_encoder.clear_conditioned_layers()
526
+ return output
527
+
528
+ def _get_data_list_and_visual_tokens(
529
+ self,
530
+ all_box_list,
531
+ box_token_id,
532
+ prebox_token_id,
533
+ input_ids,
534
+ vision_x,
535
+ nothing_embedding = None,
536
+ ):
537
+ box_locations = (torch.logical_or(input_ids == box_token_id, input_ids == prebox_token_id)).nonzero()
538
+ prev_batch_idx = -1
539
+ media_idx = []
540
+ cnt = 0
541
+ data_list = []
542
+ visual_tokens = []
543
+ if len(all_box_list) != len(box_locations):
544
+ logging.info(f"WARNING. len(all_box_list) != len(box_locations) {len(all_box_list)} vs {len(box_locations)}")
545
+ self.valid = False
546
+ for III, (batch_idx, idx) in enumerate(box_locations):
547
+ batch_idx = batch_idx.item()
548
+ idx = idx.item()
549
+ if batch_idx != prev_batch_idx:
550
+ prev_batch_idx = batch_idx
551
+ this_input_ids = input_ids[batch_idx]
552
+ cnt += len(media_idx)
553
+ media_idx = (this_input_ids == self.media_token_id).nonzero().reshape(-1).tolist()
554
+ for i in range(len(media_idx)):
555
+             if i == len(media_idx) - 1 or (media_idx[i] < idx < media_idx[i+1]):
556
+ break
557
+ image_index = cnt + i
558
+ size = int(vision_x[image_index].shape[0] ** 0.5)
559
+ image_feature = vision_x[image_index].reshape(size, size, -1)
560
+             try:
+                 raw_xyxy = all_box_list[III]
+             except IndexError:
+                 # fewer boxes than box/prebox tokens; fall back to the last box so the pass can continue
+                 logging.info("out of scope for all_box_list")
+                 raw_xyxy = all_box_list[-1]
565
+ region_xyxy = np.array(raw_xyxy) * size
566
+ x1, y1, x2, y2 = region_xyxy.astype(int).clip(0, size-1).tolist()
567
+ x2 = max(x1, x2)
568
+ y2 = max(y1, y2)
569
+ if x1 + y1 + x2 + y2 == 0.0 and nothing_embedding is not None:
570
+ visual_token = nothing_embedding
571
+ else:
572
+ if self.roi_align:
573
+ visual_token = torchvision.ops.roi_align(
574
+ image_feature.permute(2, 0, 1).unsqueeze(0),
575
+ [torch.tensor(region_xyxy.astype(np.float32)).unsqueeze(0).cuda()],
576
+ output_size=self.roi_output_size,
577
+ spatial_scale=1.0,
578
+ )
579
+ visual_token = visual_token.squeeze(0).flatten(1).permute(1, 0)
580
+ else:
581
+ visual_token = image_feature[y1:y2+1, x1:x2+1].reshape(-1, image_feature.shape[-1]).mean(0)
582
+ box = torch.tensor([0] + raw_xyxy, device=visual_token.device, dtype=visual_token.dtype)
583
+ data_list.append([visual_token, box, batch_idx, idx, i])
584
+ visual_tokens.append(visual_token)
585
+ return data_list, visual_tokens
586
+
587
+ def _encode_vision_x(self, vision_x: torch.Tensor, image_nums=None, image_start_index_list=None, added_bbox_list=None, num_beams=None, input_ids=None, relations=None):
588
+ """
589
+ Compute media tokens from vision input by passing it through vision encoder and conditioning language model.
590
+ Args:
591
+ vision_x (torch.Tensor): Vision input
592
+ shape (B, T_img, F, C, H, W)
593
+ Images in the same chunk are collated along T_img, and frames are collated along F
594
+ Currently only F=1 is supported (single-frame videos)
595
+
596
+ rearrange code based on https://github.com/dhansmair/flamingo-mini
597
+ """
598
+ assert vision_x.ndim == 6, "vision_x should be of shape (b, T_img, F, C, H, W)"
599
+ b, T, F = vision_x.shape[:3]
600
+ assert F == 1, "Only single frame supported"
601
+
602
+ vision_x = rearrange(vision_x, "b T F c h w -> (b T F) c h w")
603
+ if hasattr(self.vision_encoder, "visual"):
604
+ vision_x = self.vision_encoder.visual(vision_x)[1]
605
+ else:
606
+ vision_x = self.vision_encoder(vision_x).flatten(2).permute(0, 2, 1)
607
+ vision_x = rearrange(vision_x, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
608
+
609
+ # print(vision_x[0,0,0])
610
+ # # DEBUG HERE
611
+ # if torch.distributed.get_rank() == 0:
612
+ # import pdb; pdb.set_trace()
613
+ # else:
614
+ # torch.distributed.barrier()
615
+ vision_x = vision_x.mean(2)
616
+ # vision_x = self.perceiver(vision_x) # reshapes to (b, T, n, d)
617
+ # vision_x = self.vis_proj(vision_x) + self.vis_position_embedding(self.vis_position_ids).unsqueeze(0)
618
+ vision_x = self.vis_proj(vision_x).squeeze(1)
619
+ self.image_embedding = vision_x
620
+
621
+ data_list = None
622
+ visual_tokens = None
623
+ if added_bbox_list is not None and input_ids is not None:
624
+             all_box_list = added_bbox_list[0].tolist()
+             for boxes in added_bbox_list[1:]:  # renamed from `list` to avoid shadowing the builtin
+                 all_box_list.extend(boxes.tolist())
627
+ data_list, visual_tokens = self._get_data_list_and_visual_tokens(
628
+ all_box_list=all_box_list,
629
+ box_token_id=self.box_token_id,
630
+ prebox_token_id=self.prebox_token_id,
631
+ input_ids=input_ids,
632
+ vision_x=vision_x,
633
+ nothing_embedding=self.lang_encoder.gpt_neox.embed_in(torch.tensor(self.nothing_token_id).to(self.lang_encoder.gpt_neox.embed_in.weight.device)) if self.nothing_token_id is not None else None,
634
+ )
635
+
636
+ first_layer = self.lang_encoder._get_decoder_layers()[0]
637
+ first_layer.condition_vis_x(vision_x, image_nums, image_start_index_list, num_beams=num_beams, visual_tokens=visual_tokens, data_list=[[d[2], d[3]] for d in data_list] if data_list is not None else data_list)
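
One step worth seeing in isolation is how `_get_data_list_and_visual_tokens` turns a normalized box into a single visual token by mean-pooling the patch features it covers (the ROI-align branch is omitted). A hedged, standalone sketch with invented grid size and feature dimension:

```python
import torch

# Hedged sketch of the mean-pooling branch of _get_data_list_and_visual_tokens:
# a normalized xyxy box selects a rectangle of patch features, which are averaged
# into a single visual token. The 16x16 grid and 1024-dim features are illustrative.
def pool_box_feature(image_feature: torch.Tensor, box_xyxy) -> torch.Tensor:
    """image_feature: (size, size, dim) patch grid; box_xyxy: normalized [x1, y1, x2, y2]."""
    size = image_feature.shape[0]
    x1, y1, x2, y2 = (torch.tensor(box_xyxy) * size).int().clamp(0, size - 1).tolist()
    x2, y2 = max(x1, x2), max(y1, y2)                       # guard against degenerate boxes
    return image_feature[y1:y2 + 1, x1:x2 + 1].reshape(-1, image_feature.shape[-1]).mean(0)

patch_grid = torch.randn(16, 16, 1024)                      # 16x16 patches, 1024-dim features
token = pool_box_feature(patch_grid, [0.25, 0.25, 0.75, 0.75])
print(token.shape)                                          # torch.Size([1024])
```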
multimodal/build/lib/open_flamingo/src/flamingo_lm.py ADDED
@@ -0,0 +1,173 @@
1
+ import random
2
+ import torch
3
+ import torch.nn as nn
4
+ import numpy as np
5
+
6
+ from .helpers import GatedCrossAttentionBlock
7
+ from .utils import getattr_recursive, setattr_recursive
8
+
9
+
10
+ class FlamingoLayer(nn.Module):
11
+ def __init__(self, decoder_layer):
12
+ super().__init__()
13
+ self.decoder_layer = decoder_layer
14
+ self.vis_x = None
15
+ self.image_nums = None
16
+ self.image_start_index_list = None
17
+ self.media_locations = None
18
+ self.add_visual_token = False
19
+ self.input_ids = None
20
+
21
+ def is_conditioned(self) -> bool:
22
+ """Check whether the layer is conditioned."""
23
+ return self.vis_x is not None
24
+
25
+ # Used this great idea from this implementation of Flamingo (https://github.com/dhansmair/flamingo-mini/)
26
+ def condition_vis_x(self, vis_x, image_nums=None, image_start_index_list=None, num_beams=None, visual_tokens=None, data_list=None):
27
+ self.vis_x = vis_x
28
+ self.image_nums = image_nums
29
+ self.image_start_index_list = image_start_index_list
30
+ self.num_beams = num_beams
31
+ self.visual_tokens = visual_tokens
32
+ self.data_list = data_list
33
+ self.input_ids = None
34
+
35
+
36
+ def condition_media_locations(self, media_locations):
37
+ self.media_locations = media_locations
38
+
39
+ def condition_attend_previous(self, attend_previous):
40
+ self.attend_previous = attend_previous
41
+
42
+ def forward(
43
+ self,
44
+ hidden_states, # alignment with hugging face name
45
+ attention_mask=None,
46
+ **decoder_layer_kwargs,
47
+ ):
48
+ if self.media_locations is None:
49
+ raise ValueError("media_locations must be conditioned before forward pass")
50
+
51
+ if self.vis_x is not None:
52
+ if self.training:
53
+ single_length = self.vis_x.shape[-2]
54
+ image_nums = self.image_nums
55
+ image_start_index_list = self.image_start_index_list
56
+ image_nums = [0] + np.cumsum(image_nums).tolist()
57
+ for i, (image_num_begin, image_num_end, start_indices) in enumerate(zip(image_nums[:-1], image_nums[1:], image_start_index_list)):
58
+ for index in start_indices:
59
+ if image_num_begin < image_num_end:
60
+ hidden_states[i, index:index+single_length] = self.vis_x[image_num_begin]
61
+ image_num_begin += 1
62
+
63
+ if self.visual_tokens is not None and len(self.visual_tokens) != 0:
64
+ for i, (x, y) in enumerate(self.data_list):
65
+ if len(self.visual_tokens[i].shape) > 1:
66
+ # print(self.visual_tokens[i].shape[0], "embedding")
67
+ hidden_states[x, y+1-self.visual_tokens[i].shape[0]:y+1] = self.visual_tokens[i]
68
+ else:
69
+ # print(self.visual_tokens[i].shape[0], "embedding")
70
+ hidden_states[x, y] = self.visual_tokens[i]
71
+
72
+ elif not self.training:
73
+ if (
74
+ ("past_key_value" in decoder_layer_kwargs and decoder_layer_kwargs["past_key_value"] is None) or
75
+ ("layer_past" in decoder_layer_kwargs and decoder_layer_kwargs["layer_past"] is None)
76
+ ):
77
+ single_length = self.vis_x.shape[-2]
78
+ image_nums = self.image_nums
79
+ image_start_index_list = self.image_start_index_list
80
+ image_nums = [0] + np.cumsum(image_nums).tolist()
81
+ for i, (image_num_begin, image_num_end, start_indices) in enumerate(zip(image_nums[:-1], image_nums[1:], image_start_index_list)):
82
+ for index in start_indices:
83
+ if image_num_begin < image_num_end:
84
+ hidden_states[i, index:index+single_length] = self.vis_x[image_num_begin]
85
+ image_num_begin += 1
86
+ if self.visual_tokens is not None and len(self.visual_tokens) != 0:
87
+ for i, (x, y) in enumerate(self.data_list):
88
+ # import pdb; pdb.set_trace()
89
+ # print(x, y, self.visual_tokens[i].shape)
90
+ if len(self.visual_tokens[i].shape) > 1:
91
+ # print(self.visual_tokens[i].shape[0], "embedding")
92
+ hidden_states[x, y+1-self.visual_tokens[i].shape[0]:y+1] = self.visual_tokens[i]
93
+ else:
94
+ # print(self.visual_tokens[i].shape[0], "embedding")
95
+ hidden_states[x, y] = self.visual_tokens[i]
96
+ hidden_states = self.decoder_layer(
97
+ hidden_states, attention_mask=attention_mask, **decoder_layer_kwargs
98
+ )
99
+ return hidden_states
100
+
101
+
102
+ class FlamingoLMMixin(nn.Module):
103
+ """
104
+ Mixin to add cross-attention layers to a language model.
105
+ """
106
+
107
+ def set_decoder_layers_attr_name(self, decoder_layers_attr_name):
108
+ self.decoder_layers_attr_name = decoder_layers_attr_name
109
+
110
+ def _get_decoder_layers(self):
111
+ return getattr_recursive(self, self.decoder_layers_attr_name)
112
+
113
+ def _set_decoder_layers(self, value):
114
+ setattr_recursive(self, self.decoder_layers_attr_name, value)
115
+
116
+ def init_flamingo(
117
+ self,
118
+ media_token_id,
119
+ use_media_placement_augmentation,
120
+ ):
121
+ """
122
+ Initialize Flamingo by adding a new gated cross attn to the decoder. Store the media token id for computing the media locations.
123
+ """
124
+ self._set_decoder_layers(
125
+ nn.ModuleList(
126
+ [FlamingoLayer(decoder_layer) for decoder_layer in self._get_decoder_layers()]
127
+ )
128
+ )
129
+ self.media_token_id = media_token_id
130
+ self.use_media_placement_augmentation = use_media_placement_augmentation
131
+ self.initialized_flamingo = True
132
+
133
+ def forward(self, *input, **kwargs):
134
+ """Condition the Flamingo layers on the media locations before forward()"""
135
+ if not self.initialized_flamingo:
136
+ raise ValueError(
137
+ "Flamingo layers are not initialized. Please call `init_flamingo` first."
138
+ )
139
+
140
+ input_ids = kwargs["input_ids"] if "input_ids" in kwargs else input[0]
141
+ media_locations = input_ids == self.media_token_id
142
+ attend_previous = (
143
+ (random.random() < 0.5) if self.use_media_placement_augmentation else True
144
+ )
145
+
146
+ if (
147
+ "gpt2" in self.__class__.__name__.lower()
148
+ or "codegen" in self.__class__.__name__.lower()
149
+ ):
150
+ for layer in self.transformer.h:
151
+ layer.condition_media_locations(media_locations)
152
+ layer.condition_attend_previous(attend_previous)
153
+ elif "gptneox" in self.__class__.__name__.lower():
154
+ for layer in self.gpt_neox.layers:
155
+ layer.condition_media_locations(media_locations)
156
+ layer.condition_attend_previous(attend_previous)
157
+ else:
158
+ for layer in self.get_decoder().layers:
159
+ layer.condition_media_locations(media_locations)
160
+ layer.condition_attend_previous(attend_previous)
161
+ return super().forward(
162
+ *input, **kwargs
163
+ ) # Call the other parent's forward method
164
+
165
+ def is_conditioned(self) -> bool:
166
+ """Check whether all decoder layers are already conditioned."""
167
+ return all(l.is_conditioned() for l in self._get_decoder_layers())
168
+
169
+ def clear_conditioned_layers(self):
170
+ for layer in self._get_decoder_layers():
171
+ layer.condition_vis_x(None)
172
+ layer.condition_media_locations(None)
173
+ layer.condition_attend_previous(None)
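
Since `FlamingoLayer` here does not add gated cross-attention but instead overwrites slots of the hidden states with cached image embeddings, a small self-contained sketch of that splicing loop may help (all shapes and indices below are invented):

```python
import torch

# Illustrative sketch of the splice performed in FlamingoLayer.forward: each cached image
# embedding (vis_x) overwrites `single_length` positions of the text hidden states, starting
# at the offsets recorded in image_start_index_list.
hidden_states = torch.zeros(2, 40, 8)            # (batch, seq_len, hidden_dim)
vis_x = torch.randn(3, 16, 8)                    # 3 images, 16 visual tokens each
image_nums = [2, 1]                              # images per batch element
image_start_index_list = [[1, 20], [5]]          # insertion offsets per batch element

single_length = vis_x.shape[-2]
cum = [0] + torch.cumsum(torch.tensor(image_nums), dim=0).tolist()
for i, (begin, end, starts) in enumerate(zip(cum[:-1], cum[1:], image_start_index_list)):
    for index in starts:
        if begin < end:
            hidden_states[i, index:index + single_length] = vis_x[begin]
            begin += 1

print(hidden_states[0, 1:17].abs().sum() > 0)    # image 0 was written into sample 0
```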
multimodal/build/lib/open_flamingo/src/gcn.py ADDED
@@ -0,0 +1,137 @@
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+ from torch.nn.parameter import Parameter
5
+ import math
6
+ from torch.autograd import Variable
7
+ from torchvision.ops import box_iou
8
+
9
+
10
+
11
+ class GraphConvolution(nn.Module):
12
+ """
13
+ Simple GCN layer, similar to https://arxiv.org/abs/1609.02907
14
+ """
15
+
16
+ def __init__(self, in_features, out_features, bias=True, skip=True):
17
+ super(GraphConvolution, self).__init__()
18
+ self.skip = skip
19
+ self.in_features = in_features
20
+ self.out_features = out_features
21
+ self.weight = Parameter(torch.Tensor(in_features, out_features))
22
+ if bias:
23
+ self.bias = Parameter(torch.Tensor(out_features))
24
+ else:
25
+ self.register_parameter('bias', None)
26
+ self.reset_parameters()
27
+
28
+ def reset_parameters(self):
29
+ stdv = 1. / math.sqrt(self.weight.size(1))
30
+ self.weight.data.uniform_(-stdv, stdv)
31
+ if self.bias is not None:
32
+ self.bias.data.uniform_(-stdv, stdv)
33
+
34
+ def forward(self, input, adj):
35
+ # TODO make fc more efficient via "pack_padded_sequence"
36
+ # import ipdb; ipdb.set_trace()
37
+ support = torch.bmm(input, self.weight.unsqueeze(
38
+ 0).expand(input.shape[0], -1, -1))
39
+ output = torch.bmm(adj, support)
40
+ #output = SparseMM(adj)(support)
41
+ if self.bias is not None:
42
+ output += self.bias.unsqueeze(0).expand(input.shape[0], -1, -1)
43
+ if self.skip:
44
+ output += support
45
+
46
+ return output
47
+
48
+ def __repr__(self):
49
+ return self.__class__.__name__ + ' (' \
50
+ + str(self.in_features) + ' -> ' \
51
+ + str(self.out_features) + ')'
52
+
53
+
54
+ class GCN_sim(nn.Module):
55
+ def __init__(self, dim_in, dim_hidden, dim_out, dropout, num_layers):
56
+ super(GCN_sim, self).__init__()
57
+ assert num_layers >= 1
58
+ self.fc_k = nn.Linear(dim_in, dim_hidden)
59
+ self.fc_q = nn.Linear(dim_in, dim_hidden)
60
+
61
+ dim_hidden = dim_out if num_layers == 1 else dim_hidden
62
+ self.gcs = nn.ModuleList([
63
+ GraphConvolution(dim_in, dim_hidden)
64
+ ])
65
+
66
+ for i in range(num_layers - 1):
67
+ dim_tmp = dim_out if i == num_layers-2 else dim_hidden
68
+ self.gcs.append(GraphConvolution(dim_hidden, dim_tmp))
69
+
70
+ self.dropout = dropout
71
+
72
+ def construct_graph(self, x, length):
73
+ # TODO make fc more efficient via "pack_padded_sequence"
74
+ emb_k = self.fc_k(x)
75
+ emb_q = self.fc_q(x)
76
+
77
+ s = torch.bmm(emb_k, emb_q.transpose(1, 2))
78
+
79
+ s_mask = s.data.new(*s.size()).fill_(1).bool() # [B, T1, T2]
80
+ # Init similarity mask using lengths
81
+ for i, (l_1, l_2) in enumerate(zip(length, length)):
82
+ s_mask[i][:l_1, :l_2] = 0
83
+ s_mask = Variable(s_mask)
84
+ s.data.masked_fill_(s_mask.data, -float("inf"))
85
+
86
+ a_weight = F.softmax(s, dim=2) # [B, t1, t2]
87
+ # remove nan from softmax on -inf
88
+ a_weight.data.masked_fill_(a_weight.data != a_weight.data, 0)
89
+
90
+ return a_weight
91
+
92
+ def forward(self, x, length):
93
+ adj_sim = self.construct_graph(x, length)
94
+
95
+ for gc in self.gcs:
96
+ x = F.relu(gc(x, adj_sim))
97
+ x = F.dropout(x, self.dropout, training=self.training)
98
+
99
+ return x
100
+
101
+
102
+ class GCN(nn.Module):
103
+ def __init__(self, dim_in, dim_hidden, dim_out, dropout, mode, skip, num_layers, ST_n_next=None):
104
+ super(GCN, self).__init__()
105
+ assert len(mode) != 0
106
+ self.mode = mode
107
+ self.skip = skip
108
+
109
+ if "GCN_sim" in mode:
110
+ self.GCN_sim = GCN_sim(
111
+ dim_in, dim_hidden, dim_out, dropout, num_layers)
112
+
113
+ def forward(self, x, length):
114
+
115
+ out = []
116
+ if "GCN_sim" in self.mode:
117
+ out.append(self.GCN_sim(x, length))
118
+
119
+ out = sum(out)
120
+ if self.skip:
121
+ out += x
122
+
123
+ return out
124
+
125
+
126
+ if __name__ == '__main__':
127
+ model = GCN(512, 128, 512, 0.5, mode=[
128
+ "GCN_sim"], skip=True, num_layers=3, ST_n_next=3)
129
+ bs, T, N = 10, 5, 10
130
+ n_node = T*N
131
+
132
+ input = torch.rand(bs, n_node, 512)
133
+ length = torch.ones((bs))
134
+ length = length.type(torch.IntTensor)
135
+ bboxes = torch.rand((bs, 5, 10, 4))
136
+
137
+ output = model(input, length)
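
The adjacency used by `GCN_sim` is just a masked softmax over pairwise similarities; a tiny standalone sketch of that masking step follows (batch size, lengths, and dimensions are invented for illustration):

```python
import torch
import torch.nn.functional as F

# Standalone sketch of the masked softmax in GCN_sim.construct_graph: similarities outside
# each element's valid length are pushed to -inf, softmaxed, and fully-masked rows are zeroed.
emb_k = torch.randn(2, 6, 8)
emb_q = torch.randn(2, 6, 8)
length = torch.tensor([6, 3])                    # valid nodes per batch element

s = torch.bmm(emb_k, emb_q.transpose(1, 2))      # (B, N, N) pairwise similarities
mask = torch.ones_like(s, dtype=torch.bool)
for i, l in enumerate(length.tolist()):
    mask[i, :l, :l] = False                      # keep only valid-vs-valid pairs
s = s.masked_fill(mask, float("-inf"))
adj = F.softmax(s, dim=2)
adj = torch.nan_to_num(adj, nan=0.0)             # rows that were fully masked become all zeros

print(adj[1, 3:].sum())                          # padded rows contribute nothing -> tensor(0.)
```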
multimodal/build/lib/open_flamingo/src/helpers.py ADDED
@@ -0,0 +1,263 @@
1
+ """
2
+ Taken from https://github.com/lucidrains/flamingo-pytorch
3
+ """
4
+
5
+ import torch
6
+ from einops import rearrange, repeat
7
+ from einops_exts import rearrange_many
8
+ from torch import einsum, nn
9
+
10
+
11
+ def exists(val):
12
+ return val is not None
13
+
14
+
15
+ def FeedForward(dim, mult=4):
16
+ inner_dim = int(dim * mult)
17
+ return nn.Sequential(
18
+ nn.LayerNorm(dim),
19
+ nn.Linear(dim, inner_dim, bias=False),
20
+ nn.GELU(),
21
+ nn.Linear(inner_dim, dim, bias=False),
22
+ )
23
+
24
+
25
+ class PerceiverAttention(nn.Module):
26
+ def __init__(self, *, dim, dim_head=64, heads=8):
27
+ super().__init__()
28
+ self.scale = dim_head**-0.5
29
+ self.heads = heads
30
+ inner_dim = dim_head * heads
31
+
32
+ self.norm_media = nn.LayerNorm(dim)
33
+ self.norm_latents = nn.LayerNorm(dim)
34
+
35
+ self.to_q = nn.Linear(dim, inner_dim, bias=False)
36
+ self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
37
+ self.to_out = nn.Linear(inner_dim, dim, bias=False)
38
+
39
+ def forward(self, x, latents):
40
+ """
41
+ Args:
42
+ x (torch.Tensor): image features
43
+ shape (b, T, n1, D)
44
+ latent (torch.Tensor): latent features
45
+ shape (b, T, n2, D)
46
+ """
47
+ x = self.norm_media(x)
48
+ latents = self.norm_latents(latents)
49
+
50
+ h = self.heads
51
+
52
+ q = self.to_q(latents)
53
+ kv_input = torch.cat((x, latents), dim=-2)
54
+ k, v = self.to_kv(kv_input).chunk(2, dim=-1)
55
+ q, k, v = rearrange_many((q, k, v), "b t n (h d) -> b h t n d", h=h)
56
+ q = q * self.scale
57
+
58
+ # attention
59
+ sim = einsum("... i d, ... j d -> ... i j", q, k)
60
+ sim = sim - sim.amax(dim=-1, keepdim=True).detach()
61
+ attn = sim.softmax(dim=-1)
62
+
63
+ out = einsum("... i j, ... j d -> ... i d", attn, v)
64
+ out = rearrange(out, "b h t n d -> b t n (h d)", h=h)
65
+ return self.to_out(out)
66
+
67
+
68
+ class PerceiverResampler(nn.Module):
69
+ def __init__(
70
+ self,
71
+ *,
72
+ dim,
73
+ depth=6,
74
+ dim_head=64,
75
+ heads=8,
76
+ num_latents=64,
77
+ max_num_media=None,
78
+ max_num_frames=None,
79
+ ff_mult=4,
80
+ ):
81
+ super().__init__()
82
+ assert False, "Do not use PerceiverResampler"
83
+ self.latents = nn.Parameter(torch.randn(num_latents, dim))
84
+ self.frame_embs = (
85
+ nn.Parameter(torch.randn(max_num_frames, dim))
86
+ if exists(max_num_frames)
87
+ else None
88
+ )
89
+ self.media_time_embs = (
90
+ nn.Parameter(torch.randn(max_num_media, 1, dim))
91
+ if exists(max_num_media)
92
+ else None
93
+ )
94
+
95
+ self.layers = nn.ModuleList([])
96
+ for _ in range(depth):
97
+ self.layers.append(
98
+ nn.ModuleList(
99
+ [
100
+ PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
101
+ FeedForward(dim=dim, mult=ff_mult),
102
+ ]
103
+ )
104
+ )
105
+
106
+ self.norm = nn.LayerNorm(dim)
107
+
108
+ def forward(self, x):
109
+ """
110
+ Args:
111
+ x (torch.Tensor): image features
112
+ shape (b, T, F, v, D)
113
+ Returns:
114
+ shape (b, T, n, D) where n is self.num_latents
115
+ """
116
+ b, T, F, v = x.shape[:4]
117
+
118
+ # frame and media time embeddings
119
+ if exists(self.frame_embs):
120
+ frame_embs = repeat(self.frame_embs[:F], "F d -> b T F v d", b=b, T=T, v=v)
121
+ x = x + frame_embs
122
+ x = rearrange(
123
+ x, "b T F v d -> b T (F v) d"
124
+ ) # flatten the frame and spatial dimensions
125
+ if exists(self.media_time_embs):
126
+ x = x + self.media_time_embs[:T]
127
+
128
+ # blocks
129
+ latents = repeat(self.latents, "n d -> b T n d", b=b, T=T)
130
+ for attn, ff in self.layers:
131
+ latents = attn(x, latents) + latents
132
+ latents = ff(latents) + latents
133
+ return self.norm(latents)
134
+
135
+
136
+ # gated cross attention
137
+
138
+
139
+ class MaskedCrossAttention(nn.Module):
140
+ def __init__(
141
+ self,
142
+ *,
143
+ dim,
144
+ dim_visual,
145
+ dim_head=64,
146
+ heads=8,
147
+ only_attend_immediate_media=True,
148
+ ):
149
+ super().__init__()
150
+ self.scale = dim_head**-0.5
151
+ self.heads = heads
152
+ inner_dim = dim_head * heads
153
+
154
+ self.norm = nn.LayerNorm(dim)
155
+
156
+ self.to_q = nn.Linear(dim, inner_dim, bias=False)
157
+ self.to_kv = nn.Linear(dim_visual, inner_dim * 2, bias=False)
158
+ self.to_out = nn.Linear(inner_dim, dim, bias=False)
159
+
160
+ # whether for text to only attend to immediate preceding image, or all previous images
161
+ self.only_attend_immediate_media = only_attend_immediate_media
162
+
163
+ def forward(self, x, media, media_locations=None, attend_previous=True):
164
+ """
165
+ Args:
166
+ x (torch.Tensor): text features
167
+ shape (B, T_txt, D_txt)
168
+ media (torch.Tensor): image features
169
+ shape (B, T_img, n, D_img) where n is the dim of the latents
170
+ media_locations: boolean mask identifying the media tokens in x
171
+ shape (B, T_txt)
172
+ attend_previous: bool
173
+                 If False, ignore the immediately preceding image and only attend once a following image appears
+         """
+         assert attend_previous, "text must attend to the image that precedes it"
176
+
177
+ _, T_img, n = media.shape[:3]
178
+ h = self.heads
179
+
180
+ x = self.norm(x)
181
+
182
+ q = self.to_q(x)
183
+ media = rearrange(media, "b t n d -> b (t n) d")
184
+
185
+ k, v = self.to_kv(media).chunk(2, dim=-1)
186
+ q, k, v = rearrange_many((q, k, v), "b n (h d) -> b h n d", h=h)
187
+
188
+ q = q * self.scale
189
+
190
+ sim = einsum("... i d, ... j d -> ... i j", q, k)
191
+
192
+ if exists(media_locations):
193
+ # at each boolean of True, increment the time counter (relative to media time)
194
+ text_time = media_locations.cumsum(dim=-1)
195
+ media_time = torch.arange(T_img, device=x.device) + 1
196
+
197
+ if not attend_previous:
198
+ text_time[~media_locations] += 1
199
+ # make sure max is still the number of images in the sequence
200
+ text_time[
201
+ text_time
202
+ > repeat(
203
+ torch.count_nonzero(media_locations, dim=1),
204
+ "b -> b i",
205
+ i=text_time.shape[1],
206
+ )
207
+ ] = 0
208
+
209
+ # text time must equal media time if only attending to most immediate image
210
+ # otherwise, as long as text time is greater than media time (if attending to all previous images / media)
211
+ mask_op = torch.eq if self.only_attend_immediate_media else torch.ge
212
+
213
+ text_to_media_mask = mask_op(
214
+ rearrange(text_time, "b i -> b 1 i 1"),
215
+ repeat(media_time, "j -> 1 1 1 (j n)", n=n),
216
+ )
217
+ sim = sim.masked_fill(~text_to_media_mask, -torch.finfo(sim.dtype).max)
218
+
219
+ sim = sim - sim.amax(dim=-1, keepdim=True).detach()
220
+ attn = sim.softmax(dim=-1)
221
+
222
+ if exists(media_locations) and self.only_attend_immediate_media:
223
+ # any text without a preceding media needs to have attention zeroed out
224
+ text_without_media_mask = text_time == 0
225
+ text_without_media_mask = rearrange(
226
+ text_without_media_mask, "b i -> b 1 i 1"
227
+ )
228
+ attn = attn.masked_fill(text_without_media_mask, 0.0)
229
+
230
+ out = einsum("... i j, ... j d -> ... i d", attn, v)
231
+ out = rearrange(out, "b h n d -> b n (h d)")
232
+ return self.to_out(out)
233
+
234
+
235
+ class GatedCrossAttentionBlock(nn.Module):
236
+ def __init__(
237
+ self,
238
+ *,
239
+ dim,
240
+ dim_visual,
241
+ dim_head=64,
242
+ heads=8,
243
+ ff_mult=4,
244
+ only_attend_immediate_media=True,
245
+ ):
246
+ super().__init__()
247
+ self.attn = MaskedCrossAttention(
248
+ dim=dim,
249
+ dim_visual=dim_visual,
250
+ dim_head=dim_head,
251
+ heads=heads,
252
+ only_attend_immediate_media=only_attend_immediate_media,
253
+ )
254
+
255
+ def forward(
256
+ self,
257
+ x,
258
+ media,
259
+ media_locations=None,
260
+ attend_previous=True,
261
+ ):
262
+ x = self.attn(x, media, media_locations=media_locations, attend_previous=attend_previous) + x
263
+ return x
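
To make the media-time masking in `MaskedCrossAttention` concrete, here is a hedged, standalone sketch of how the text-to-media mask comes out for a short invented sequence with two images and three latents per image:

```python
import torch
from einops import rearrange, repeat

# Hedged sketch of the text-to-media mask built in MaskedCrossAttention.forward with
# only_attend_immediate_media=True: each text token attends only to the latents of the
# most recent image. The 6-token sequence below is invented.
media_locations = torch.tensor([[1, 0, 0, 1, 0, 0]])   # 1 marks an image token
T_img, n = 2, 3                                        # 2 images, 3 latents per image

text_time = media_locations.cumsum(dim=-1)             # image index each token belongs to
media_time = torch.arange(T_img) + 1                   # 1-indexed image ids
text_to_media_mask = torch.eq(
    rearrange(text_time, "b i -> b 1 i 1"),
    repeat(media_time, "j -> 1 1 1 (j n)", n=n),
)
print(text_to_media_mask[0, 0].int())
# rows 0-2 (tokens of image 1) attend to latents 0-2; rows 3-5 attend to latents 3-5
```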
multimodal/build/lib/open_flamingo/src/utils.py ADDED
@@ -0,0 +1,31 @@
1
+ def extend_instance(obj, mixin):
2
+ """Apply mixins to a class instance after creation"""
3
+ base_cls = obj.__class__
4
+ base_cls_name = obj.__class__.__name__
5
+ obj.__class__ = type(
6
+ base_cls_name, (mixin, base_cls), {}
7
+ ) # mixin needs to go first for our forward() logic to work
8
+
9
+
10
+ def getattr_recursive(obj, att):
11
+ """
12
+ Return nested attribute of obj
13
+ Example: getattr_recursive(obj, 'a.b.c') is equivalent to obj.a.b.c
14
+ """
15
+ if att == "":
16
+ return obj
17
+ i = att.find(".")
18
+ if i < 0:
19
+ return getattr(obj, att)
20
+ else:
21
+ return getattr_recursive(getattr(obj, att[:i]), att[i + 1 :])
22
+
23
+
24
+ def setattr_recursive(obj, att, val):
25
+ """
26
+ Set nested attribute of obj
27
+ Example: setattr_recursive(obj, 'a.b.c', val) is equivalent to obj.a.b.c = val
28
+ """
29
+ if "." in att:
30
+ obj = getattr_recursive(obj, ".".join(att.split(".")[:-1]))
31
+ setattr(obj, att.split(".")[-1], val)
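
A quick usage illustration of these helpers, assuming the `open_flamingo` package from this repository is installed (the toy classes below are hypothetical):

```python
from open_flamingo.src.utils import extend_instance, getattr_recursive, setattr_recursive

class Greeter:
    def hello(self):
        return "hello"

class LoudMixin:
    def hello(self):
        # the mixin is placed first in the MRO by extend_instance, so super() reaches Greeter
        return super().hello().upper()

g = Greeter()
extend_instance(g, LoudMixin)
print(g.hello())                               # "HELLO"

class Node:
    pass

root = Node()
root.child = Node()
root.child.value = 3
print(getattr_recursive(root, "child.value"))  # 3
setattr_recursive(root, "child.value", 7)
print(root.child.value)                        # 7
```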
multimodal/build/lib/open_flamingo/train/__init__.py ADDED
@@ -0,0 +1 @@
1
+
multimodal/build/lib/open_flamingo/train/data2.py ADDED
@@ -0,0 +1,868 @@
1
+ import functools
2
+ import logging
3
+ import math
4
+ import random
5
+ import sys
6
+ from dataclasses import dataclass
7
+ from multiprocessing import Value
8
+ import time
9
+ import os
10
+ import numpy as np
11
+ import pickle as pkl
12
+ from open_flamingo.train.instruction_template import (
13
+ VG_RELATION_TEMPLATES,
14
+ PISC_TEMPLATES,
15
+ )
16
+
17
+ import torch
18
+ import webdataset as wds
19
+ from PIL import Image
20
+ from torch.utils.data import DataLoader, IterableDataset, get_worker_info
21
+ from torch.utils.data.distributed import DistributedSampler
22
+ from webdataset.tariterators import (
23
+ base_plus_ext,
24
+ tar_file_expander,
25
+ url_opener,
26
+ valid_sample,
27
+ )
28
+
29
+ from groundingdino.demo.caption_grounder import caption_grounder
30
+ from groundingdino.demo.inference_on_laion import add_loc_to_text
31
+ from groundingdino.demo.inference_on_laion import nms_without_score
32
+ from groundingdino.demo.inference_on_laion import calculate_iou
33
+
34
+ Image.MAX_IMAGE_PIXELS = 1000000000
35
+ LAION2B_NUM_SAMPLE = 1500000000
36
+ VQAV2_TRAIN_NUM_SAMPLE = 1828467
37
+ VG_RELATION_BBOX_SIZE = 600
38
+
39
+ REL_LABELS = ['__background__', 'above', 'across', 'against', 'along', 'and', 'at', 'attached to', 'behind', 'belonging to', 'between', 'carrying', 'covered in', 'covering', 'eating', 'flying in', 'for', 'from', 'growing on', 'hanging from', 'has', 'holding', 'in', 'in front of', 'laying on', 'looking at', 'lying on', 'made of', 'mounted on', 'near', 'of', 'on', 'on back of', 'over', 'painted on', 'parked on', 'part of', 'playing', 'riding', 'says', 'sitting on', 'standing on', 'to', 'under', 'using', 'walking in', 'walking on', 'watching', 'wearing', 'wears', 'with']
40
+
41
+ try:
42
+ import horovod.torch as hvd
43
+ except ImportError:
44
+ hvd = None
45
+
46
+ class ConcatDataset(IterableDataset):
47
+ def __init__(
48
+ self, dataset, max_length,
49
+ delimiter_id, pad_id=None, media_id=None, endofmedia_id=None,
50
+ image_embedding_size=-2, single=False, box_id=None, visual_id=None,
51
+ ):
52
+ self.dataset = dataset
53
+ self.max_length = max_length
54
+ self.delimiter_id = torch.ones(1,1).long() * delimiter_id
55
+ if pad_id is not None:
56
+ self.pad_id = int(pad_id)
57
+ if media_id is not None:
58
+ self.media_id = torch.ones(1,1).long() * int(media_id)
59
+ if endofmedia_id is not None:
60
+ self.endofmedia_id = torch.ones(1,1).long() * int(endofmedia_id)
61
+ if image_embedding_size > 0:
62
+ logging.info(f"image_embedding_size: {image_embedding_size}")
63
+ self.image_embedding_size = image_embedding_size + 2
64
+ self.single = single
65
+ self.box_id = box_id
66
+ self.visual_id = visual_id
67
+
68
+ def __iter__(self):
69
+ while True:
70
+ input_ids_list = []
71
+ attention_mask_list = []
72
+ image_list = []
73
+ image_start_index_list = []
74
+ added_bbox_list = []
75
+ relations_list = []
76
+ cnt = 0
77
+ while cnt < self.max_length:
78
+ sample = next(self.dataset)
79
+ if len(sample) >= 4:
80
+ image = sample[0].unsqueeze(0)
81
+ input_ids = sample[1]
82
+ attention_mask = sample[2]
83
+ added_bbox = sample[3]
84
+ image_list.append(image)
85
+ added_bbox_list.append(added_bbox)
86
+ if len(sample) == 5:
87
+ relations_list.append(sample[4])
88
+ else:
89
+ sample = sample[0]
90
+ input_ids = sample[0]
91
+ attention_mask = sample[1]
92
+ input_ids_list.append(input_ids)
93
+ attention_mask_list.append(attention_mask)
94
+ cnt += input_ids.shape[-1]
95
+ if self.single:
96
+ break
97
+ input_ids = torch.cat(input_ids_list, dim=-1)[0]
98
+ attention_mask = torch.cat(attention_mask_list, dim=-1)[0]
99
+ if not self.single:
100
+ input_ids = input_ids[:self.max_length]
101
+ attention_mask = attention_mask[:self.max_length]
102
+ # TODO: fix visual number not match
103
+ if len(image_list) != 0:
104
+ images = torch.cat(image_list, dim=0)
105
+ image_begin = (input_ids == self.media_id[0,0]).nonzero().view(-1)
106
+ image_end = (input_ids == self.endofmedia_id[0,0]).nonzero().view(-1)
107
+ if len(image_begin) != len(image_end):
108
+ assert len(image_begin) == len(image_end) + 1
109
+ input_ids[image_begin[-1]:] = self.pad_id
110
+ attention_mask[image_begin[-1]:] = 0
111
+ image_begin = image_begin[:-1]
112
+ eos_token_num = len((input_ids == self.delimiter_id[0,0]).nonzero().view(-1))
113
+ if eos_token_num != len(image_begin) + 1:
114
+ input_ids[image_begin[-1]:] = self.pad_id
115
+ attention_mask[image_begin[-1]:] = 0
116
+ image_begin = image_begin[:-1]
117
+ image_end = image_end[:-1]
118
+ images = images[:len(image_end)]
119
+ added_bbox_list = added_bbox_list[:len(image_end)]
120
+ relations_list = relations_list[:len(image_end)]
121
+ image_start_index_list = (image_begin + 1).tolist()
122
+ expand_list = added_bbox_list[0]
123
+ for x in added_bbox_list[1:]:
124
+ expand_list.extend(x)
125
+ yield images, len(images), image_start_index_list, input_ids, attention_mask, expand_list, relations_list
126
+ else:
127
+ yield input_ids, attention_mask
128
+
129
+
130
+ class SharedEpoch:
131
+ def __init__(self, epoch: int = 0):
132
+ self.shared_epoch = Value("i", epoch)
133
+
134
+ def set_value(self, epoch):
135
+ self.shared_epoch.value = epoch
136
+
137
+ def get_value(self):
138
+ return self.shared_epoch.value
139
+
140
+
141
+ @dataclass
142
+ class DataInfo:
143
+ dataloader: DataLoader
144
+ sampler: DistributedSampler = None
145
+ shared_epoch: SharedEpoch = None
146
+
147
+ def set_epoch(self, epoch):
148
+ if self.shared_epoch is not None:
149
+ self.shared_epoch.set_value(epoch)
150
+ if self.sampler is not None and isinstance(self.sampler, DistributedSampler):
151
+ self.sampler.set_epoch(epoch)
152
+
153
+
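A minimal usage sketch of the two helpers above (names such as webloader and num_epochs are placeholders, not part of this commit): the epoch counter lives in a multiprocessing Value, so bumping it in the main process is visible to dataloader workers that hold the same SharedEpoch.

shared_epoch = SharedEpoch(epoch=0)
info = DataInfo(dataloader=webloader, shared_epoch=shared_epoch)
for epoch in range(num_epochs):
    info.set_epoch(epoch)  # workers read this via the shared Value before resampling shards
    for batch in info.dataloader:
        ...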
154
+ def filter_no_caption_or_no_image(sample):
155
+ return ("txt" in sample) and (
156
+ "png" in sample or "jpg" in sample or "jpeg" in sample
157
+ )
158
+
159
+
160
+ def log_and_continue(exn):
161
+ """Call in an exception handler to ignore any exception, issue a warning, and continue."""
162
+ if "ValueError" in repr(exn) or "KeyError" in repr(exn): # Avoid spamming logs with these
163
+ return True
164
+ logging.warning(f"Handling webdataset error ({repr(exn)}). Ignoring.")
165
+ return True
166
+ # DEBUG
167
+ # log_and_continue = None
168
+ # DEBUG
169
+
170
+
171
+ def group_by_keys_nothrow(
172
+ data, keys=base_plus_ext, lcase=True, suffixes=None, handler=None
173
+ ):
174
+ """Return function over iterator that groups key, value pairs into samples.
175
+
176
+ :param keys: function that splits the key into key and extension (base_plus_ext)
177
+ :param lcase: convert suffixes to lower case (Default value = True)
178
+ """
179
+ current_sample = None
180
+ tar_idx = None
181
+ for filesample in data:
182
+ assert isinstance(filesample, dict)
183
+ current_tar_idx = filesample["__url__"].split("/")[-1].split(".")[0]
184
+ if current_tar_idx != tar_idx:
185
+ tar_idx = current_tar_idx
186
+ if "blip2_all_data_ground" in filesample["__url__"]:
187
+ relation_data_dir = os.path.join("/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/junyan/raw/blip2_all_data_relation", tar_idx)
188
+ missing_file = False
189
+ try:
190
+ data_info = pkl.load(open(os.path.join(relation_data_dir, "custom_data_info.pkl"), "rb"))
191
+ prediction = pkl.load(open(os.path.join(relation_data_dir, "custom_prediction.pkl"), "rb"))
192
+ idx_to_files = data_info["idx_to_files"]
193
+ ind_to_classes = data_info["ind_to_classes"]
194
+ ind_to_predicates = data_info["ind_to_predicates"]
195
+ files_to_idx = {x.split("#")[-1]: i for i, x in enumerate(idx_to_files)}
196
+ except Exception: # relation metadata for this shard is missing or unreadable
197
+ missing_file = True
198
+ fname, value = filesample["fname"], filesample["data"]
199
+ prefix, suffix = keys(fname)
200
+ if prefix is None:
201
+ continue
202
+ if lcase:
203
+ suffix = suffix.lower()
204
+ # FIXME webdataset version throws if suffix in current_sample, but we have a potential for
205
+ # this happening in the current LAION400m dataset if a tar ends with same prefix as the next
206
+ # begins, rare, but can happen since prefix aren't unique across tar files in that dataset
207
+ if (
208
+ current_sample is None
209
+ or prefix != current_sample["__key__"]
210
+ or suffix in current_sample
211
+ ):
212
+ if valid_sample(current_sample):
213
+ yield current_sample
214
+ current_sample = dict(__key__=prefix, __url__=filesample["__url__"])
215
+ if "blip2_all_data_ground" in filesample["__url__"] and not missing_file:
216
+ try:
217
+ idx = files_to_idx[prefix]
218
+ prediction[idx]["bbox"] = [np.array(bbox)/VG_RELATION_BBOX_SIZE for bbox in prediction[idx]["bbox"]]
219
+ current_sample["relation_data"] = prediction[idx]
220
+ except Exception: # no relation entry for this sample; fall back to an empty dict
221
+ current_sample["relation_data"] = dict()
222
+ else:
223
+ current_sample["relation_data"] = dict()
224
+ if suffixes is None or suffix in suffixes:
225
+ current_sample[suffix] = value
226
+ if valid_sample(current_sample):
227
+ yield current_sample
228
+
229
+
230
+ def tarfile_to_samples_nothrow(src, handler=log_and_continue):
231
+ # NOTE this is a re-impl of the webdataset impl with group_by_keys that doesn't throw
232
+ streams = url_opener(src, handler=handler)
233
+ files = tar_file_expander(streams, handler=handler)
234
+ samples = group_by_keys_nothrow(files, handler=handler)
235
+ return samples
236
+
237
+
238
+ def pytorch_worker_seed(increment=0):
239
+ """get dataloader worker seed from pytorch"""
240
+ worker_info = get_worker_info()
241
+ if worker_info is not None:
242
+ # favour using the seed already created for pytorch dataloader workers if it exists
243
+ seed = worker_info.seed
244
+ if increment:
245
+ # space out seed increments so they can't overlap across workers in different iterations
246
+ seed += increment * max(1, worker_info.num_workers)
247
+ return seed
248
+ # fallback to wds rank based seed
249
+ return wds.utils.pytorch_worker_seed()
250
+
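A small illustration of the seed spacing above (illustrative numbers only): with 4 workers and a base worker seed s, epoch increments 1, 2, 3 map to s + 4, s + 8, s + 12, which is what the max(1, num_workers) factor is for.

def example_epoch_seeds(base_seed, num_workers, num_epochs):
    # mirrors the increment logic in pytorch_worker_seed above
    return [base_seed + e * max(1, num_workers) for e in range(num_epochs)]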
251
+
252
+ _SHARD_SHUFFLE_SIZE = 2000
253
+ _SHARD_SHUFFLE_INITIAL = 500
254
+ _SAMPLE_SHUFFLE_SIZE = 5000
255
+ _SAMPLE_SHUFFLE_INITIAL = 1000
256
+
257
+
258
+ class ResampledShards2(IterableDataset):
259
+ """An iterable dataset yielding a list of urls."""
260
+
261
+ def __init__(
262
+ self,
263
+ urls,
264
+ nshards=sys.maxsize,
265
+ worker_seed=None,
266
+ deterministic=False,
267
+ epoch=-1,
268
+ ):
269
+ """Sample shards from the shard list with replacement.
270
+ :param urls: a list of URLs as a Python list or brace notation string
271
+ """
272
+ super().__init__()
273
+ urls = wds.shardlists.expand_urls(urls)
274
+ self.urls = urls
275
+ assert isinstance(self.urls[0], str)
276
+ self.nshards = nshards
277
+ self.rng = random.Random()
278
+ self.worker_seed = worker_seed
279
+ self.deterministic = deterministic
280
+ self.epoch = epoch
281
+
282
+ def __iter__(self):
283
+ """Return an iterator over the shards."""
284
+ if isinstance(self.epoch, SharedEpoch):
285
+ epoch = self.epoch.get_value()
286
+ else:
287
+ # NOTE: this epoch tracking is problematic in a multiprocess (dataloader workers or train)
288
+ # situation as different workers may wrap at different times (or not at all).
289
+ self.epoch += 1
290
+ epoch = self.epoch
291
+
292
+ if self.deterministic:
293
+ # reset seed w/ epoch if deterministic
294
+ if self.worker_seed is None:
295
+ # pytorch worker seed should be deterministic since it is initialized from args.seed + rank + worker id
296
+ seed = pytorch_worker_seed(epoch)
297
+ else:
298
+ seed = self.worker_seed() + epoch
299
+ seed = seed + int(time.time())
300
+ self.rng.seed(seed)
301
+ # logging.info(f"epoch: {epoch} seed: {seed}")
302
+ self.rng.shuffle(self.urls)
303
+ # logging.info(f"{len(self.urls)} | {self.urls[:2]}")
304
+ for url in self.urls:
305
+ # logging.info(f"{seed}: {url}")
306
+ yield dict(url=url)
307
+
308
+
309
+ def preprocess_image(sample, image_processor):
310
+ image = image_processor(sample)
311
+ return image
312
+
313
+
314
+ def preprocess_text(sample, tokenizer, max_length, single=False):
315
+ if not single:
316
+ text = tokenizer(tokenizer.bos_token+sample.strip(), return_tensors="pt", max_length=max_length, truncation=True)
317
+ else:
318
+ text = tokenizer(tokenizer.bos_token+sample.strip(), return_tensors="pt", max_length=max_length, truncation=True, padding='max_length')
319
+ return text["input_ids"], text["attention_mask"]
320
+
321
+
322
+ def preprocess_encoded_text(sample, tokenizer, max_length):
323
+ sample = sample.decode("utf-8")
324
+ return preprocess_text(sample, tokenizer, max_length=max_length)
325
+
326
+
327
+ def _merge_bbox_previsual(added_bbox_list):
328
+ bbox_list = []
329
+ for bboxes in added_bbox_list:
330
+ x1 = bboxes[:, 0].min()
331
+ y1 = bboxes[:, 1].min()
332
+ x2 = bboxes[:, 2].max()
333
+ y2 = bboxes[:, 3].max()
334
+ bbox_list.append(torch.tensor([x1, y1, x2, y2], device=bboxes.device, dtype=bboxes.dtype).unsqueeze(0))
335
+ return bbox_list
336
+
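An illustrative call to the helper above (made-up normalized coordinates): each group of phrase boxes is collapsed into a single enclosing box, which is what later gets attached to one prebox token.

import torch
boxes = torch.tensor([[0.10, 0.20, 0.40, 0.50],
                      [0.30, 0.10, 0.60, 0.45]])
_merge_bbox_previsual([boxes])  # -> [tensor([[0.10, 0.10, 0.60, 0.50]])]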
337
+
338
+ def _find_idx(text, subtext):
339
+ loc = 0
340
+ locs = []
341
+ while text.find(subtext, loc) != -1:
342
+ loc = text.find(subtext, loc)
343
+ locs.append(loc)
344
+ loc += len(subtext)
345
+ return locs
346
+
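For reference, a quick example of the substring-index helper above:

_find_idx("a <|#visual#|> b <|#visual#|>", "<|#visual#|>")  # -> [2, 17]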
347
+ def preprocess_ground_caption(sample, image_processor, tokenizer, image_embedding_size, generator, prob_ground=1.0, single=False, use_format_v2=False, add_visual_token=False, max_length=None, args=None):
348
+ assert max_length is not None
349
+ assert not single, "single is not supported for preprocess_ground_caption"
350
+ image, caption, logits_filt, boxes_filt, relation_data = sample
351
+ if len(logits_filt.shape) == 1 and logits_filt.shape[0] == 4 and len(boxes_filt.shape) == 1 and boxes_filt.shape[0] == 4:
352
+ raise NotImplementedError # relation data is missing for this sample; the visual-genome fallback below is unreachable
353
+ return preprocess_visual_genome(sample=sample, image_processor=image_processor, tokenizer=tokenizer, image_embedding_size=image_embedding_size, prob_ground=prob_ground, single=single, use_format_v2=use_format_v2, add_visual_token=add_visual_token, max_length=max_length)
354
+ image = preprocess_image(image, image_processor=image_processor)
355
+ added_bbox = []
356
+ if (prob_ground != 0 and random.random() <= prob_ground) or prob_ground == 1.0:
357
+ boxes_filt, pred_phrases = generator.postprocess(logits_filt, boxes_filt, generator.ground_model, caption, generator.text_threshold, generator.box_threshold, with_logits=True)
358
+ caption, added_bbox = add_loc_to_text(
359
+ boxes_filt, pred_phrases, caption,
360
+ expand=args.expand, always_expand=args.longer_previsual,
361
+ )
362
+ visual_loc = []
363
+ obj_loc = []
364
+ endofobj_loc = []
365
+ visual_token = "<|#visual#|>"
366
+ previsual_token = "<|#previsual#|>"
367
+ box_token = "<|#box#|>"
368
+ prebox_token = "<|#prebox#|>"
369
+ end_token = "<|#endofobject#|>"
370
+ object_token = "<|#object#|>"
371
+ end_of_attr_token = "<|#endofattr#|>"
372
+ preend_of_attr_token = "<|#preendofattr#|>"
373
+ visual_loc = _find_idx(caption, visual_token)
374
+ try:
375
+ if len(visual_loc) != len(added_bbox):
376
+ logging.warning(f"visual_loc: {visual_loc}")
377
+ logging.warning(f"added_bbox: {added_bbox}")
378
+ except Exception: # logging the mismatch is best-effort; the assert below still enforces it
379
+ pass
380
+ assert len(visual_loc) == len(added_bbox)
381
+ delta = 0
382
+ for i, (loc, boxes) in enumerate(zip(visual_loc, added_bbox)):
383
+ loc += delta
384
+ boxes = nms_without_score(boxes)
385
+ added_bbox[i] = boxes
386
+ added_tokens = end_token + visual_token + box_token * len(boxes) + end_of_attr_token
387
+ caption = caption[:loc] + added_tokens + caption[len(visual_token) + loc:]
388
+ delta += len(added_tokens) - len(visual_token)
389
+
390
+ if use_format_v2:
391
+ merge_added_bbox = _merge_bbox_previsual(added_bbox)
392
+ # step 1: move <|#object#|> before the space char
393
+ while caption.find(f" {object_token}") != -1:
394
+ caption = caption.replace(f" {object_token}", f"{object_token} ")
395
+ # step 2: add <|#previsual#|> after <|#object#|> for 75% except the first object
396
+ i = 0
397
+ II = -1
398
+ if args.no_visual:
399
+ flag = False
400
+ delete_visual_prob = 10.0
401
+ else:
402
+ flag = True
403
+ delete_visual_prob = 0.75
404
+ while i < len(caption):
405
+ if caption[i: i + len(object_token)] == object_token:
406
+ II += 1
407
+ if (not args.longer_previsual and not flag and random.random() < delete_visual_prob) or (args.longer_previsual and (flag or random.random() < delete_visual_prob)):
408
+ # delete visual and add previsual
409
+ visual_start_idx = caption.find(end_token, i+1) + len(end_token)
410
+ visual_end_idx = caption.find(end_of_attr_token, visual_start_idx+1) + len(end_of_attr_token)
411
+ caption = caption[:visual_start_idx] + caption[visual_end_idx:]
412
+ caption = caption[:i + len(object_token)] + previsual_token + prebox_token + preend_of_attr_token + caption[i + len(object_token):]
413
+ added_bbox[II] = merge_added_bbox[II]
414
+ i += 1
415
+ flag = False
416
+ if args.no_previsual and args.no_visual:
417
+ caption = caption.replace(previsual_token, "").replace(prebox_token, "").replace(preend_of_attr_token, "")
418
+ added_bbox = []
419
+ caption = caption.replace(preend_of_attr_token, object_token).replace(end_of_attr_token, end_token)
420
+
421
+
422
+ if args.roi_align:
423
+ i = 0
424
+ pad_num = args.roi_output_size ** 2 - 1
425
+ while i < len(caption):
426
+ if caption[i: i + len(prebox_token)] == prebox_token:
427
+ caption = caption[:i] + tokenizer.pad_token * pad_num + caption[i:]
428
+ i += len(tokenizer.pad_token) * pad_num + len(prebox_token)
429
+ elif caption[i: i + len(box_token)] == box_token:
430
+ caption = caption[:i] + tokenizer.pad_token * pad_num + caption[i:]
431
+ i += len(tokenizer.pad_token) * pad_num + len(box_token)
432
+ i += 1
433
+
434
+ caption = f"<|#image#|>{tokenizer.pad_token*image_embedding_size}<|#endofimage#|>" + caption
435
+ input_ids, attention_mask = preprocess_text(caption, tokenizer, max_length=max_length)
436
+ relations = []
437
+ if args.only_grounded_sample and "<|#visual#|>" not in caption:
438
+ raise ValueError
439
+ return image, input_ids, attention_mask, added_bbox, relations
440
+
441
+
442
+ def preprocess_visual_genome(sample, image_processor, tokenizer, image_embedding_size, prob_ground=1.0, single=False, use_format_v2=False, add_visual_token=False, max_length=None):
443
+ assert max_length is not None
444
+ assert not single, "single is not supported for preprocess_visual_genome"
445
+ image, caption, xyxy, _ = sample
446
+ image = preprocess_image(image, image_processor=image_processor)
447
+ caption = f"<|#image#|>{tokenizer.pad_token*image_embedding_size}<|#endofimage#|><|#object#|>" + caption.strip() + "<|#endofobject#|><|#visual#|><|#box#|><|#endofattr#|>"
448
+ input_ids, attention_mask = preprocess_text(caption, tokenizer, max_length=max_length)
449
+ added_bbox = [torch.tensor(np.expand_dims(xyxy, 0).astype(np.float32) / 224)]
450
+ return image, input_ids, attention_mask, added_bbox
451
+
452
+ special_predicate = [
453
+ "and",
454
+ "has",
455
+ "says",
456
+ "wears",
457
+ ]
458
+
459
+ original_predicate = {
460
+ "and": "and",
461
+ "has": "have",
462
+ "says": "say",
463
+ "wears": "wear",
464
+ }
465
+
466
+
467
+ def generate_vg_relation_sample(boxA, boxB, nameA, nameB, relation):
468
+ if relation in ["and", "of"]:
469
+ id = 0
470
+ else:
471
+ id = random.choice(range(len(VG_RELATION_TEMPLATES)))
472
+ text = VG_RELATION_TEMPLATES[id].format(nameA=nameA, nameB=nameB, relation=relation, use_is="is" if relation not in special_predicate else "", is_or_does="is" if relation not in special_predicate else "does", relation_do=relation if relation not in special_predicate else original_predicate[relation])
473
+ if id in [0]:
474
+ added_bbox = [
475
+ torch.tensor([boxA]),
476
+ torch.tensor([boxB]),
477
+ ]
478
+ elif id in [1]:
479
+ added_bbox = [
480
+ torch.tensor([boxA]),
481
+ torch.tensor([boxB]),
482
+ torch.tensor([boxA]),
483
+ torch.tensor([boxB]),
484
+ ]
485
+ elif id in [2]:
486
+ added_bbox = [
487
+ torch.tensor([boxA]),
488
+ torch.tensor([boxA]),
489
+ torch.tensor([boxB]),
490
+ ]
491
+ elif id in [3]:
492
+ added_bbox = [
493
+ torch.tensor([boxB]),
494
+ torch.tensor([boxA]),
495
+ torch.tensor([boxB]),
496
+ ]
497
+ elif id in [4]:
498
+ added_bbox = [
499
+ torch.tensor([boxA]),
500
+ torch.tensor([boxB]),
501
+ ]
502
+ elif id in [5]:
503
+ added_bbox = [
504
+ torch.tensor([boxB]),
505
+ torch.tensor([boxA]),
506
+ ]
507
+ else:
508
+ raise NotImplementedError
509
+ return text, added_bbox
510
+
511
+ def generate_pisc_sample(boxA, boxB, relation):
512
+ id = random.choice(range(len(PISC_TEMPLATES)))
513
+ text = PISC_TEMPLATES[id].format(relation=relation)
514
+ if id in [0]:
515
+ if random.random() < 0.5:
516
+ added_bbox = [
517
+ torch.tensor([boxA]),
518
+ torch.tensor([boxB]),
519
+ ]
520
+ else:
521
+ added_bbox = [
522
+ torch.tensor([boxB]),
523
+ torch.tensor([boxA]),
524
+ ]
525
+ elif id in [1]:
526
+ if random.random() < 0.5:
527
+ added_bbox = [torch.tensor([boxA, boxB])]
528
+ else:
529
+ added_bbox = [torch.tensor([boxB, boxA])]
530
+ return text, added_bbox
531
+
532
+
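A sketch of how the PISC sampler above might be called (the boxes are made-up normalized xyxy values, not from this commit); template 0 keeps one box per box token, while template 1 packs both people into a single two-row tensor.

text, added_bbox = generate_pisc_sample(
    boxA=[0.10, 0.15, 0.45, 0.95],
    boxB=[0.50, 0.20, 0.85, 0.95],
    relation="friends",
)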
533
+ def preprocess_instruct(sample, image_processor, tokenizer, image_embedding_size, prob_ground=1.0, single=False, use_format_v2=False, add_visual_token=False, max_length=None):
534
+ image_path, dataset, data = sample
535
+ image = Image.open(image_path)
536
+ size = image_processor.transforms[0].size
537
+ image = image.resize((size, size))
538
+ if dataset == "pisc_relation_split":
539
+ boxA = data[0]
540
+ boxB = data[1]
541
+ relation = data[2]
542
+ text, added_bbox = generate_pisc_sample(boxA, boxB, relation)
543
+ # import cv2
544
+ # boxA *= size
545
+ # boxB *= size
546
+ # open_cv_image = np.array(image)
547
+ # open_cv_image = open_cv_image[:, :, ::-1].copy()
548
+ # open_cv_image = cv2.rectangle(open_cv_image, boxA[:2].astype(int), boxA[2:].astype(int), (255, 0, 0), 2)
549
+ # open_cv_image = cv2.rectangle(open_cv_image, boxB[:2].astype(int), boxB[2:].astype(int), (0, 255, 0), 2)
550
+ # cv2.imwrite("output.jpg", open_cv_image)
551
+ # import pdb; pdb.set_trace()
552
+ elif dataset == "vg_relation":
553
+ boxA = data[0][0]
554
+ nameA = data[0][1]
555
+ boxB = data[1][0]
556
+ nameB = data[1][1]
557
+ relation = data[2]
558
+ text, added_bbox = generate_vg_relation_sample(boxA, boxB, nameA, nameB, relation)
559
+ image = preprocess_image(image, image_processor=image_processor)
560
+ caption = f"<|#image#|>{tokenizer.pad_token*image_embedding_size}<|#endofimage#|>" + text + tokenizer.eos_token
561
+ input_ids, attention_mask = preprocess_text(caption, tokenizer, max_length=max_length, single=True)
562
+ # return image, input_ids, attention_mask, added_bbox
563
+ images = image.unsqueeze(0)
564
+ image_start_index_list = [2]
565
+ return images, len(images), image_start_index_list, input_ids, attention_mask, added_bbox
566
+
567
+
568
+ def preprocess_caption(sample, image_processor, tokenizer, image_embedding_size, max_length, single=False):
569
+ image, caption = sample
570
+ caption = f"<|#image#|>{tokenizer.pad_token*image_embedding_size}<|#endofimage#|>" + caption
571
+ image = preprocess_image(image, image_processor=image_processor)
572
+ input_ids, attention_mask = preprocess_text(caption, tokenizer, max_length=max_length, single=single)
573
+ return image, input_ids, attention_mask
574
+
575
+
576
+ def get_pile_dataset(args, image_processor, tokenizer, epoch=0, floor=False):
577
+ input_shards = args.pile_shards
578
+ assert input_shards is not None
579
+ resampled = getattr(args, "dataset_resampled", False)
580
+ assert resampled, "turn on dataset_resampled to allow infinite stream of samples"
581
+
582
+ # create a shared epoch store to sync epoch to dataloader worker proc
583
+ shared_epoch = SharedEpoch(epoch=epoch)
584
+ preprocess_text_fn = functools.partial(preprocess_encoded_text, tokenizer=tokenizer, max_length=args.max_length)
585
+ pipeline = [
586
+ ResampledShards2(input_shards, deterministic=True, epoch=shared_epoch),
587
+ tarfile_to_samples_nothrow,
588
+ wds.shuffle(
589
+ bufsize=_SAMPLE_SHUFFLE_SIZE,
590
+ initial=_SAMPLE_SHUFFLE_INITIAL,
591
+ ),
592
+ wds.to_tuple("txt", handler=log_and_continue),
593
+ wds.map_tuple(
594
+ preprocess_text_fn, handler=log_and_continue
595
+ ),
596
+ ]
597
+ # with_epoch(sys.maxsize) will give us an infinite sample stream
598
+ dataset = wds.DataPipeline(*pipeline).with_epoch(sys.maxsize)
599
+ delimiter_id = tokenizer(tokenizer.eos_token, add_special_tokens=False)["input_ids"][-1]
600
+ dataset = ConcatDataset(iter(dataset), max_length=args.max_length, delimiter_id=delimiter_id)
601
+
602
+
603
+ def text_collate_fn(items):
604
+ try:
605
+ input_ids = torch.cat([x[0].unsqueeze(0) for x in items], dim=0)
606
+ attention_mask = torch.cat([x[1].unsqueeze(0) for x in items], dim=0)
607
+ return input_ids, attention_mask
608
+ except Exception: # drop malformed text batches
609
+ return None, None
610
+
611
+ dataloader = wds.WebLoader(
612
+ dataset,
613
+ batch_size=args.batch_size_pile,
614
+ shuffle=False,
615
+ num_workers=args.workers,
616
+ persistent_workers=False,
617
+ collate_fn=text_collate_fn,
618
+ )
619
+ return DataInfo(dataloader=dataloader, shared_epoch=shared_epoch)
620
+
621
+
622
+ # FIXME:
623
+ # modify /gpfs/u/home/LMCG/LMCGljnn/scratch/miniconda3-ppc64le/envs/unified/lib/python3.9/site-packages/webdataset/filters.py, line 433
624
+ # combine_tensors=True to combine_tensors=False
625
+ def get_ground_laion_dataset(args, image_processor, tokenizer, epoch=0, floor=False):
626
+ input_shards = args.laion_shards
627
+ assert input_shards is not None
628
+ resampled = getattr(args, "dataset_resampled", False)
629
+ assert resampled, "turn on dataset_resampled to allow infinite stream of samples"
630
+ # create a shared epoch store to sync epoch to dataloader worker proc
631
+ shared_epoch = SharedEpoch(epoch=epoch)
632
+ generator = caption_grounder(
633
+ config_file="/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
634
+ checkpoint_path="/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
635
+ cpu_only=True,
636
+ # box_threshold=0.5, text_threshold=0.3,
637
+ )
638
+ preprocess_ground_caption_fn = functools.partial(
639
+ preprocess_ground_caption, image_processor=image_processor, tokenizer=tokenizer,
640
+ image_embedding_size=args.vis_embed_size, single=args.single, generator=generator,
641
+ prob_ground=args.prob_ground, use_format_v2=args.use_format_v2,
642
+ add_visual_token=args.add_visual_token, max_length=args.max_length,
643
+ args=args,
644
+ )
645
+ pipeline = [
646
+ ResampledShards2(input_shards, deterministic=True, epoch=shared_epoch),
647
+ tarfile_to_samples_nothrow,
648
+ wds.shuffle(
649
+ bufsize=_SAMPLE_SHUFFLE_SIZE,
650
+ initial=_SAMPLE_SHUFFLE_INITIAL,
651
+ ),
652
+ wds.select(filter_no_caption_or_no_image),
653
+ wds.decode("pilrgb", partial=True, handler=log_and_continue),
654
+ wds.to_tuple("jpg;png;jpeg", "txt", "logits.pyd", "boxes.pyd", "relation_data", handler=log_and_continue),
655
+ wds.map(
656
+ preprocess_ground_caption_fn, handler=log_and_continue
657
+ ),
658
+ ]
659
+
660
+ dataset = wds.DataPipeline(*pipeline).with_epoch(sys.maxsize)
661
+ # for sample in dataset:
662
+ # print(tokenizer.decode(sample[1][0]).replace("<PAD>", ""))
663
+ # DEBUG
664
+ # dataset = wds.DataPipeline(*pipeline)
665
+ # from tqdm import tqdm
666
+ # for sample in tqdm(dataset):
667
+ # nn = 0
668
+ # for x in sample[1][0]:
669
+ # if x == tokenizer("<|#object#|>", add_special_tokens=False)["input_ids"][-1]:
670
+ # nn += 1
671
+ # if x == tokenizer("<|#endofobject#|>", add_special_tokens=False)["input_ids"][-1]:
672
+ # nn -= 1
673
+ # if nn not in [0, 1]:
674
+ # print(tokenizer.decode(sample[1][0]).replace("<PAD>", ""))
675
+ # import pdb; pdb.set_trace()
676
+ # if nn != 0:
677
+ # print(tokenizer.decode(sample[1][0]).replace("<PAD>", ""))
678
+ # import pdb; pdb.set_trace()
679
+ # from groundingdino.demo.inference_on_laion import OBJ_LENGTHS
680
+ # # import pdb; pdb.set_trace()
681
+ # print(sum(OBJ_LENGTHS) / len(OBJ_LENGTHS))
682
+ # exit()
683
+ # DEBUG
684
+
685
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
686
+ delimiter_id = tokenizer(tokenizer.eos_token, add_special_tokens=False)["input_ids"][-1]
687
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
688
+ box_id = tokenizer("<|#box#|>", add_special_tokens=False)["input_ids"][-1]
689
+ visual_id = tokenizer("<|#visual#|>", add_special_tokens=False)["input_ids"][-1]
690
+ dataset = ConcatDataset(
691
+ iter(dataset), max_length=args.max_length,
692
+ delimiter_id=delimiter_id,
693
+ pad_id=tokenizer.pad_token_id,
694
+ media_id=media_token_id,
695
+ endofmedia_id=endofmedia_token_id,
696
+ box_id=box_id,
697
+ visual_id=visual_id,
698
+ image_embedding_size=args.vis_embed_size,
699
+ single=args.single,
700
+ )
701
+
702
+ def image_collate_fn(items):
703
+ images = torch.cat([x[0] for x in items], dim=0)
704
+ image_nums = [x[1] for x in items]
705
+ image_start_index_list = [x[2] for x in items]
706
+ input_ids = torch.cat([x[3].unsqueeze(0) for x in items], dim=0)
707
+ attention_mask = torch.cat([x[4].unsqueeze(0) for x in items], dim=0)
708
+ added_bbox_list = [x[5] for x in items]
709
+ expand_list = added_bbox_list[0]
710
+ for x in added_bbox_list[1:]:
711
+ expand_list.extend(x)
712
+ relations_list = [x[6] for x in items]
713
+ return images, image_nums, image_start_index_list, input_ids, attention_mask, expand_list, relations_list
714
+
715
+ dataloader = wds.WebLoader(
716
+ dataset,
717
+ batch_size=args.batch_size_laion,
718
+ shuffle=False,
719
+ num_workers=args.workers,
720
+ persistent_workers=False,
721
+ collate_fn=image_collate_fn,
722
+ )
723
+ round_fn = math.floor if floor else math.ceil
724
+ global_batch_size = args.batch_size_laion * args.world_size
725
+ num_batches = round_fn(LAION2B_NUM_SAMPLE / global_batch_size)
726
+ dataloader.num_batches = num_batches
727
+ return DataInfo(dataloader=dataloader, shared_epoch=shared_epoch)
728
+
729
+
730
+ def get_image_text_pair_dataset(args, image_processor, tokenizer, epoch=0, floor=False):
731
+ input_shards = args.laion_shards
732
+ assert input_shards is not None
733
+ resampled = getattr(args, "dataset_resampled", False)
734
+ assert resampled, "turn on dataset_resampled to allow infinite stream of samples"
735
+ # create a shared epoch store to sync epoch to dataloader worker proc
736
+ shared_epoch = SharedEpoch(epoch=epoch)
737
+ preprocess_caption_fn = functools.partial(
738
+ preprocess_caption, image_processor=image_processor, tokenizer=tokenizer,
739
+ image_embedding_size=args.vis_embed_size, single=args.single,
740
+ max_length=args.max_length,
741
+ )
742
+ pipeline = [
743
+ ResampledShards2(input_shards, deterministic=True, epoch=shared_epoch),
744
+ tarfile_to_samples_nothrow,
745
+ wds.shuffle(
746
+ bufsize=_SAMPLE_SHUFFLE_SIZE,
747
+ initial=_SAMPLE_SHUFFLE_INITIAL,
748
+ ),
749
+ wds.select(filter_no_caption_or_no_image),
750
+ wds.decode("pilrgb", handler=log_and_continue),
751
+ wds.to_tuple("jpg;png;jpeg", "txt", handler=log_and_continue),
752
+ wds.map(
753
+ preprocess_caption_fn, handler=log_and_continue
754
+ ),
755
+ ]
756
+
757
+ dataset = wds.DataPipeline(*pipeline).with_epoch(sys.maxsize)
758
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
759
+ delimiter_id = tokenizer(tokenizer.eos_token, add_special_tokens=False)["input_ids"][-1]
760
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
761
+ dataset = ConcatDataset(
762
+ iter(dataset), max_length=args.max_length,
763
+ delimiter_id=delimiter_id,
764
+ pad_id=tokenizer.pad_token_id,
765
+ media_id=media_token_id,
766
+ endofmedia_id=endofmedia_token_id,
767
+ image_embedding_size=args.vis_embed_size,
768
+ single=args.single,
769
+ )
770
+
771
+ def image_collate_fn(items):
772
+ images = torch.cat([x[0] for x in items], dim=0)
773
+ image_nums = [x[1] for x in items]
774
+ image_start_index_list = [x[2] for x in items]
775
+ input_ids = torch.cat([x[3].unsqueeze(0) for x in items], dim=0)
776
+ attention_mask = torch.cat([x[4].unsqueeze(0) for x in items], dim=0)
777
+ return images, image_nums, image_start_index_list, input_ids, attention_mask
778
+
779
+ dataloader = wds.WebLoader(
780
+ dataset,
781
+ batch_size=args.batch_size_laion,
782
+ shuffle=False,
783
+ num_workers=args.workers,
784
+ persistent_workers=False,
785
+ collate_fn=image_collate_fn,
786
+ )
787
+ round_fn = math.floor if floor else math.ceil
788
+ global_batch_size = args.batch_size_laion * args.world_size
789
+ num_batches = round_fn(LAION2B_NUM_SAMPLE / global_batch_size)
790
+ dataloader.num_batches = num_batches
791
+ return DataInfo(dataloader=dataloader, shared_epoch=shared_epoch)
792
+
793
+
794
+ def get_instruct_dataset(args, image_processor, tokenizer, epoch=0, floor=False):
795
+ input_shards = args.laion_shards
796
+ assert input_shards is not None
797
+ resampled = getattr(args, "dataset_resampled", False)
798
+ assert resampled, "turn on dataset_resampled to allow infinite stream of samples"
799
+ # create a shared epoch store to sync epoch to dataloader worker proc
800
+ shared_epoch = SharedEpoch(epoch=epoch)
801
+ preprocess_instruct_fn = functools.partial(
802
+ preprocess_instruct, image_processor=image_processor, tokenizer=tokenizer,
803
+ image_embedding_size=args.vis_embed_size,
804
+ max_length=args.max_length,
805
+ )
806
+ pipeline = [
807
+ ResampledShards2(input_shards, deterministic=True, epoch=shared_epoch),
808
+ tarfile_to_samples_nothrow,
809
+ wds.shuffle(
810
+ bufsize=_SAMPLE_SHUFFLE_SIZE,
811
+ initial=_SAMPLE_SHUFFLE_INITIAL,
812
+ ),
813
+ wds.decode(partial=True),
814
+ wds.to_tuple("image_path.txt", "dataset.txt", "data.pyd", handler=log_and_continue),
815
+ wds.map(
816
+ preprocess_instruct_fn, handler=log_and_continue
817
+ ),
818
+ ]
819
+ dataset = wds.DataPipeline(*pipeline).with_epoch(sys.maxsize)
820
+
821
+ def image_collate_fn(items):
822
+ images = torch.cat([x[0] for x in items], dim=0)
823
+ image_nums = [x[1] for x in items]
824
+ image_start_index_list = [x[2] for x in items]
825
+ input_ids = torch.cat([x[3] for x in items], dim=0)
826
+ attention_mask = torch.cat([x[4] for x in items], dim=0)
827
+ added_bbox_list = [x[5] for x in items]
828
+ expand_list = added_bbox_list[0]
829
+ for x in added_bbox_list[1:]:
830
+ expand_list.extend(x)
831
+ return images, image_nums, image_start_index_list, input_ids, attention_mask, expand_list
832
+
833
+ dataloader = wds.WebLoader(
834
+ dataset,
835
+ batch_size=args.batch_size_laion,
836
+ shuffle=False,
837
+ num_workers=args.workers,
838
+ persistent_workers=False,
839
+ collate_fn=image_collate_fn,
840
+ )
841
+ round_fn = math.floor if floor else math.ceil
842
+ global_batch_size = args.batch_size_laion * args.world_size
843
+ num_batches = round_fn(LAION2B_NUM_SAMPLE / global_batch_size)
844
+ dataloader.num_batches = num_batches
845
+ return DataInfo(dataloader=dataloader, shared_epoch=shared_epoch)
846
+
847
+
848
+ def get_dataset_fn(dataset_type):
849
+ if dataset_type == "mmc4":
850
+ raise NotImplementedError
851
+ elif dataset_type == "pile":
852
+ return get_pile_dataset
853
+ elif dataset_type == "ground_image_text":
854
+ return get_ground_laion_dataset
855
+ elif dataset_type == "image_text":
856
+ return get_image_text_pair_dataset
857
+ elif dataset_type == "vqav2":
858
+ raise NotImplementedError
859
+ elif dataset_type == "instruct":
860
+ return get_instruct_dataset
861
+ else:
862
+ raise ValueError(f"Unsupported dataset type: {dataset_type}")
863
+
864
+
865
+ def get_data(args, image_processor, tokenizer, dataset_type, epoch=0):
866
+ return get_dataset_fn(dataset_type)(
867
+ args, image_processor=image_processor, epoch=epoch, tokenizer=tokenizer
868
+ )
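For context, the training script is expected to reach the dispatcher above roughly like this (a sketch; the real args namespace is built by argparse in train.py):

laion_data = get_data(args, image_processor, tokenizer, "ground_image_text")
pile_data = get_data(args, image_processor, tokenizer, "pile")
laion_data.set_epoch(0)  # propagate the epoch into worker processes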
multimodal/build/lib/open_flamingo/train/distributed.py ADDED
@@ -0,0 +1,128 @@
1
+ import os
2
+
3
+ import torch
4
+
5
+ try:
6
+ import horovod.torch as hvd
7
+ except ImportError:
8
+ hvd = None
9
+
10
+
11
+ def is_global_master(args):
12
+ return args.rank == 0
13
+
14
+
15
+ def is_local_master(args):
16
+ return args.local_rank == 0
17
+
18
+
19
+ def is_master(args, local=False):
20
+ return is_local_master(args) if local else is_global_master(args)
21
+
22
+
23
+ def is_using_horovod():
24
+ # NOTE w/ horovod run, OMPI vars should be set, but w/ SLURM PMI vars will be set
25
+ # Differentiating between horovod and DDP use via SLURM may not be possible, so horovod arg still required...
26
+ ompi_vars = ["OMPI_COMM_WORLD_RANK", "OMPI_COMM_WORLD_SIZE"]
27
+ pmi_vars = ["PMI_RANK", "PMI_SIZE"]
28
+ if all([var in os.environ for var in ompi_vars]) or all(
29
+ [var in os.environ for var in pmi_vars]
30
+ ):
31
+ return True
32
+ else:
33
+ return False
34
+
35
+
36
+ def is_using_distributed():
37
+ if "WORLD_SIZE" in os.environ:
38
+ return int(os.environ["WORLD_SIZE"]) > 1
39
+ if "SLURM_NTASKS" in os.environ:
40
+ return int(os.environ["SLURM_NTASKS"]) > 1
41
+ return False
42
+
43
+
44
+ def world_info_from_env():
45
+ local_rank = 0
46
+ for v in (
47
+ "LOCAL_RANK",
48
+ "MPI_LOCALRANKID",
49
+ "SLURM_LOCALID",
50
+ "OMPI_COMM_WORLD_LOCAL_RANK",
51
+ ):
52
+ if v in os.environ:
53
+ local_rank = int(os.environ[v])
54
+ break
55
+ global_rank = 0
56
+ for v in ("RANK", "PMI_RANK", "SLURM_PROCID", "OMPI_COMM_WORLD_RANK"):
57
+ if v in os.environ:
58
+ global_rank = int(os.environ[v])
59
+ break
60
+ world_size = 1
61
+ for v in ("WORLD_SIZE", "PMI_SIZE", "SLURM_NTASKS", "OMPI_COMM_WORLD_SIZE"):
62
+ if v in os.environ:
63
+ world_size = int(os.environ[v])
64
+ break
65
+
66
+ return local_rank, global_rank, world_size
67
+
68
+
69
+ def init_distributed_device(args):
70
+ # Distributed training = training on more than one GPU.
71
+ # Works in both single and multi-node scenarios.
72
+ args.distributed = False
73
+ args.world_size = 1
74
+ args.rank = 0 # global rank
75
+ args.local_rank = 0
76
+ if args.horovod:
77
+ assert hvd is not None, "Horovod is not installed"
78
+ hvd.init()
79
+ args.local_rank = int(hvd.local_rank())
80
+ args.rank = hvd.rank()
81
+ args.world_size = hvd.size()
82
+ args.distributed = True
83
+ os.environ["LOCAL_RANK"] = str(args.local_rank)
84
+ os.environ["RANK"] = str(args.rank)
85
+ os.environ["WORLD_SIZE"] = str(args.world_size)
86
+ elif is_using_distributed():
87
+ if "SLURM_PROCID" in os.environ:
88
+ # DDP via SLURM
89
+ args.local_rank, args.rank, args.world_size = world_info_from_env()
90
+ # SLURM var -> torch.distributed vars in case needed
91
+ os.environ["LOCAL_RANK"] = str(args.local_rank)
92
+ os.environ["RANK"] = str(args.rank)
93
+ os.environ["WORLD_SIZE"] = str(args.world_size)
94
+ torch.distributed.init_process_group(
95
+ backend=args.dist_backend,
96
+ init_method=args.dist_url,
97
+ world_size=args.world_size,
98
+ rank=args.rank,
99
+ )
100
+ else:
101
+ # DDP via torchrun, torch.distributed.launch
102
+ args.local_rank, _, _ = world_info_from_env()
103
+ torch.distributed.init_process_group(
104
+ backend=args.dist_backend, init_method=args.dist_url
105
+ )
106
+ args.world_size = torch.distributed.get_world_size()
107
+ args.rank = torch.distributed.get_rank()
108
+ args.distributed = True
109
+ else:
110
+ # needed to run on single gpu
111
+ torch.distributed.init_process_group(
112
+ backend=args.dist_backend,
113
+ init_method=args.dist_url,
114
+ world_size=1,
115
+ rank=0,
116
+ )
117
+
118
+ if torch.cuda.is_available():
119
+ if args.distributed and not args.no_set_device_rank:
120
+ device = "cuda:%d" % args.local_rank
121
+ else:
122
+ device = "cuda:0"
123
+ torch.cuda.set_device(device)
124
+ else:
125
+ device = "cpu"
126
+ args.device = device
127
+ device = torch.device(device)
128
+ return device
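As a reference point, the environment that world_info_from_env reads typically looks like one of the following (values are illustrative):

# torchrun / torch.distributed.launch:  LOCAL_RANK=0  RANK=3  WORLD_SIZE=8
# SLURM:                                SLURM_LOCALID=0  SLURM_PROCID=3  SLURM_NTASKS=8
local_rank, global_rank, world_size = world_info_from_env()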
multimodal/build/lib/open_flamingo/train/instruction_template.py ADDED
@@ -0,0 +1,13 @@
1
+ VG_RELATION_TEMPLATES = [
2
+ "Question: What is the relationship between<|#object#|> {nameA}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|> and<|#object#|> {nameB}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|>? Answer: {relation}.",
3
+ "Question: What is the relationship between<|#object#|> {nameA}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|> and<|#object#|> {nameB}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|>? Answer:<|#object#|> {nameA}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|> {use_is} {relation}<|#object#|> {nameB}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|>.",
4
+ "Question: What {is_or_does}<|#object#|> {nameA}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|> {relation_do}? Answer:<|#object#|> {nameA}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|> {use_is} {relation}<|#object#|>{nameB}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|>.",
5
+ "Question: What {use_is} {relation}<|#object#|> {nameB}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|>? Answer:<|#object#|> {nameA}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|> {use_is} {relation}<|#object#|> {nameB}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|>.",
6
+ "Question: What {is_or_does}<|#object#|> {nameA}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|> {relation_do}? Answer:<|#object#|> {nameB}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|>.",
7
+ "Question: What {use_is} {relation}<|#object#|> {nameB}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|>? Answer:<|#object#|> {nameA}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|>.",
8
+ ]
9
+
10
+ PISC_TEMPLATES = [
11
+ "Question: What is the social relationship between this<|#object#|> person<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|> and that<|#object#|> person<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|>? Answer: {relation}.",
12
+ "Question: What is the social relationship between these<|#object#|> people<|#endofobject#|><|#visual#|><|#box#|><|#box#|><|#endofobject#|>? Answer: {relation}.",
13
+ ]
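An illustrative instantiation of template 1 above, using the same keyword arguments that generate_vg_relation_sample in data2.py supplies (the names and relation are made up):

VG_RELATION_TEMPLATES[1].format(
    nameA="man", nameB="horse", relation="riding",
    use_is="is", is_or_does="is", relation_do="riding",
)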
multimodal/build/lib/open_flamingo/train/train.py ADDED
@@ -0,0 +1,709 @@
1
+ """ Main training script """
2
+
3
+ import argparse
4
+ import copy
5
+ import glob
6
+ import os
7
+ import random
8
+ import functools
9
+
10
+ import numpy as np
11
+ import torch
12
+ # torch.multiprocessing.set_sharing_strategy('file_system')
13
+ import wandb
14
+ from data2 import get_data
15
+ from distributed import init_distributed_device, world_info_from_env
16
+ from torch.distributed.fsdp import (
17
+ FullyShardedDataParallel as FSDP,
18
+ MixedPrecision,
19
+ BackwardPrefetch,
20
+ ShardingStrategy,
21
+ FullStateDictConfig,
22
+ CPUOffload,
23
+ StateDictType,
24
+ )
25
+ from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
26
+ from torch.distributed.fsdp.wrap import (
27
+ transformer_auto_wrap_policy,
28
+ enable_wrap,
29
+ wrap,
30
+ )
31
+
32
+ from train_utils import train_one_epoch
33
+ from transformers import (
34
+ get_constant_schedule_with_warmup,
35
+ get_cosine_schedule_with_warmup,
36
+ get_linear_schedule_with_warmup,
37
+ )
38
+
39
+ from open_flamingo import create_model_and_transforms
40
+ from torch.utils.tensorboard import SummaryWriter
41
+ from torch.nn.parallel import DistributedDataParallel as DDP
42
+ from torch.cuda.amp import GradScaler
43
+ from torch.distributed.optim import ZeroRedundancyOptimizer
44
+ import warnings
45
+ warnings.filterwarnings("ignore")
46
+ import logging
47
+ logging.basicConfig(
48
+ level=logging.INFO,
49
+ format='%(asctime)s %(message)s',
50
+ datefmt='%m/%d %I:%M:%S',
51
+ )
52
+
53
+ class FakeDataloader:
54
+ def __iter__(self):
55
+ return self
56
+
57
+ def __next__(self):
58
+ return None
59
+
60
+ def random_seed(seed=42, rank=0):
61
+ torch.manual_seed(seed + rank)
62
+ np.random.seed(seed + rank)
63
+ random.seed(seed + rank)
64
+
65
+
66
+ def get_grouped_params(model, args):
67
+ params_with_wd, params_without_wd = [], []
68
+
69
+ def apply_decay(x):
70
+ x = x.lower()
71
+ return "norm" not in x and "bn" not in x and "bias" not in x and "embed" not in x and "wte" not in x and "flat_param" not in x
72
+
73
+ for n, p in model.named_parameters():
74
+ # if p.requires_grad:
75
+ if apply_decay(n):
76
+ if torch.distributed.get_rank() == 0:
77
+ logging.info(f"with wd: {n}")
78
+ params_with_wd.append(p)
79
+ else:
80
+ if torch.distributed.get_rank() == 0:
81
+ logging.info(f"without wd: {n}")
82
+ params_without_wd.append(p)
83
+ return [
84
+ {"params": params_with_wd, "weight_decay": args.weight_decay},
85
+ {"params": params_without_wd, "weight_decay": 0.0},
86
+ ]
87
+
88
+
89
+ def lambda_policy_fn(module):
90
+ if (
91
+ len(list(module.named_children())) == 0
92
+ and getattr(module, "weight", None) is not None
93
+ and module.weight.requires_grad
94
+ ):
95
+ return True
96
+ return False
97
+
98
+
99
+ def lambda_auto_wrap_policy(
100
+ module: torch.nn.Module, recurse: bool, nonwrapped_numel: int, lambda_fn,
101
+ ) -> bool:
102
+ """
103
+ A convenient auto wrap policy to wrap submodules based on an arbitrary user
104
+ function. If ``lambda_fn(submodule) == True``, the submodule will be wrapped as
105
+ a `wrapper_cls` unit.
106
+
107
+ Return if a module should be wrapped during auto wrapping.
108
+
109
+ The first three parameters are required by :func:`_recursive_wrap`.
110
+
111
+ Args:
112
+ module (nn.Module): Current module being considered.
113
+ recurse (bool): If ``False``, then this function must decide whether
114
+ ``module`` should be wrapped as an FSDP instance or not. If
115
+ ``True``, then the function is still recursing down the module
116
+ tree as a part of the DFS.
117
+ nonwrapped_numel (int): Parameter numel not yet wrapped.
118
+
119
+ lambda_fn (Callable[[nn.Module], bool]): If this returns ``True``, then
120
+ this module will be wrapped.
121
+ """
122
+ if recurse:
123
+ return True # always recurse
124
+ return lambda_fn(module)
125
+
126
+
127
+ def main():
128
+ parser = argparse.ArgumentParser()
129
+ parser.add_argument("--vision_encoder_path", default="ViT-B-16", type=str)
130
+ parser.add_argument("--vision_encoder_pretrained", default="laion2b_s34b_b88k", type=str)
131
+ parser.add_argument("--lm_path", default="facebook/opt-1.3b", type=str)
132
+ parser.add_argument(
133
+ "--tokenizer_path",
134
+ default="facebook/opt-1.3b",
135
+ type=str,
136
+ help="path to tokenizer",
137
+ )
138
+ parser.add_argument(
139
+ "--run_name",
140
+ type=str,
141
+ default="openflamingo3B",
142
+ help="used to name saving directory and wandb run",
143
+ )
144
+ parser.add_argument("--use_media_placement_augmentation", action="store_true")
145
+ parser.add_argument("--offline", action="store_true")
146
+ parser.add_argument("--num_steps", type=int, default=300000)
147
+ parser.add_argument(
148
+ "--logging_steps", type=int, default=10, help="log loss every n steps"
149
+ )
150
+ # Sum of gradient optimization batch size
151
+ parser.add_argument("--batch_size_mmc4", type=int, default=128)
152
+ parser.add_argument("--batch_size_laion", type=int, default=128)
153
+ parser.add_argument("--batch_size_pile", type=int, default=128)
154
+ parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
155
+ parser.add_argument(
156
+ "--resume_from_checkpoint",
157
+ type=str,
158
+ help="path to checkpoint to resume from, this should contain model, optimizer, and lr_scheduler states",
159
+ default=None,
160
+ )
161
+ parser.add_argument(
162
+ "--delete_previous_checkpoint",
163
+ action="store_true",
164
+ help="delete previous checkpoint when saving new checkpoint",
165
+ )
166
+ parser.add_argument(
167
+ "--laion_shards",
168
+ type=str,
169
+ help="path to laion shards, this should be a glob pattern such as /path/to/shards/shard-{0000..0999}.tar",
170
+ )
171
+ parser.add_argument(
172
+ "--mmc4_shards",
173
+ type=str,
174
+ help="path to c4 shards, this should be a glob pattern such as /path/to/shards/shard-{0000..0999}.tar",
175
+ )
176
+ parser.add_argument(
177
+ "--pile_shards",
178
+ type=str,
179
+ default=None,
180
+ help="path to pile shards, this should be a glob pattern such as /path/to/shards/shard-{0000..0999}.tar",
181
+ )
182
+ parser.add_argument("--seed", type=int, default=42)
183
+ parser.add_argument("--learning_rate", default=1e-4, type=float)
184
+ parser.add_argument(
185
+ "--lr_scheduler",
186
+ default="constant",
187
+ type=str,
188
+ help="constant, linear, or cosine",
189
+ )
190
+ parser.add_argument("--loss_multiplier_mmc4", type=float, default=1.0)
191
+ parser.add_argument("--loss_multiplier_laion", type=float, default=1.0)
192
+ parser.add_argument("--loss_multiplier_pile", type=float, default=1.0)
193
+ parser.add_argument("--loss_multiplier_det", type=float, default=1.0)
194
+ parser.add_argument("--loss_multiplier_rel", type=float, default=1.0)
195
+ parser.add_argument("--loss_multiplier_attn", type=float, default=1.0)
196
+ parser.add_argument("--warmup_steps", default=5000, type=int)
197
+ # weight decay is only apply to YOLOX head if using FSDP
198
+ # https://medium.com/@huanghaian123/optimize-and-accelerate-yolox-with-rtmdet-hyps-in-mmyolo-80fc06d61159
199
+ parser.add_argument("--weight_decay", default=0.05, type=float)
200
+ parser.add_argument(
201
+ "--precision",
202
+ choices=["amp_fp16", "amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"],
203
+ default="fp32",
204
+ help="Floating point precision.",
205
+ )
206
+ # data args
207
+ parser.add_argument("--workers", type=int, default=1)
208
+ parser.add_argument("--dataset_resampled", action="store_true")
209
+ # distributed training args
210
+ parser.add_argument(
211
+ "--dist-url",
212
+ default="env://",
213
+ type=str,
214
+ help="url used to set up distributed training",
215
+ )
216
+ parser.add_argument(
217
+ "--dist-backend", default="nccl", type=str, help="distributed backend"
218
+ )
219
+ parser.add_argument(
220
+ "--horovod",
221
+ default=False,
222
+ action="store_true",
223
+ help="Use horovod for distributed training.",
224
+ )
225
+ parser.add_argument(
226
+ "--no-set-device-rank",
227
+ default=False,
228
+ action="store_true",
229
+ help="Don't set device index from local rank (when CUDA_VISIBLE_DEVICES restricted to one per proc).",
230
+ )
231
+ # wandb args
232
+ parser.add_argument("--report_to_wandb", default=False, action="store_true")
233
+ parser.add_argument(
234
+ "--wandb_project",
235
+ type=str,
236
+ )
237
+ parser.add_argument(
238
+ "--wandb_entity",
239
+ type=str,
240
+ )
241
+ parser.add_argument(
242
+ "--save_checkpoints_to_wandb",
243
+ default=False,
244
+ action="store_true",
245
+ help="save checkpoints to wandb",
246
+ )
247
+ parser.add_argument(
248
+ "--checkpoint_activations",
249
+ default=False,
250
+ action="store_true",
251
+ )
252
+ parser.add_argument(
253
+ "--freeze_vision_encoder",
254
+ default=False,
255
+ action="store_true",
256
+ )
257
+ parser.add_argument(
258
+ "--mmc4_textsim_threshold",
259
+ default=30,
260
+ type=float,
261
+ help="threshold for filtering images in mmc4 based on image-text similarity",
262
+ )
263
+ parser.add_argument(
264
+ "--location_token_num",
265
+ default=1000,
266
+ type=int,
267
+ )
268
+ parser.add_argument(
269
+ "--vis_embed_size",
270
+ type=int,
271
+ required=False,
272
+ )
273
+ parser.add_argument(
274
+ "--save_interval",
275
+ default=1000,
276
+ type=int,
277
+ required=False,
278
+ )
279
+ parser.add_argument(
280
+ "--skip_delete_pattern",
281
+ default=1500,
282
+ type=int,
283
+ required=False,
284
+ )
285
+ parser.add_argument(
286
+ "--ddp",
287
+ default=False,
288
+ action="store_true",
289
+ )
290
+ parser.add_argument(
291
+ "--pile_freq",
292
+ default=1,
293
+ type=int,
294
+ required=False,
295
+ )
296
+ parser.add_argument(
297
+ "--restart",
298
+ default=False,
299
+ action="store_true",
300
+ )
301
+ parser.add_argument(
302
+ "--lora",
303
+ default=False,
304
+ action="store_true",
305
+ )
306
+ parser.add_argument(
307
+ "--lora_r",
308
+ default=16,
309
+ type=int,
310
+ required=False,
311
+ )
312
+ parser.add_argument(
313
+ "--single",
314
+ default=False,
315
+ action="store_true",
316
+ )
317
+
318
+ # Finetune
319
+ parser.add_argument(
320
+ "--instruct",
321
+ default=False,
322
+ action="store_true",
323
+ )
324
+ parser.add_argument(
325
+ "--fix-ffn",
326
+ default=False,
327
+ action="store_true",
328
+ )
329
+ parser.add_argument(
330
+ "--prob_ground",
331
+ default=1.0,
332
+ type=float,
333
+ required=False,
334
+ )
335
+ parser.add_argument(
336
+ "--optimizer",
337
+ default="adamw",
338
+ type=str,
339
+ required=False,
340
+ )
341
+ parser.add_argument(
342
+ "--add_visual_token",
343
+ default=False,
344
+ action="store_true",
345
+ )
346
+ parser.add_argument(
347
+ "--use_format_v2",
348
+ default=False,
349
+ action="store_true",
350
+ )
351
+ parser.add_argument(
352
+ "--use_sam",
353
+ default=None,
354
+ type=str,
355
+ required=False,
356
+ )
357
+ parser.add_argument(
358
+ "--max-length",
359
+ default=608,
360
+ type=int,
361
+ required=False,
362
+ )
363
+ parser.add_argument(
364
+ "--image-size",
365
+ default=256,
366
+ type=int,
367
+ required=False,
368
+ )
369
+ parser.add_argument(
370
+ "--reset_llm",
371
+ default=False,
372
+ action="store_true",
373
+ )
374
+ parser.add_argument(
375
+ "--add_box",
376
+ default=False,
377
+ action="store_true",
378
+ )
379
+ parser.add_argument(
380
+ "--add_pe",
381
+ default=False,
382
+ action="store_true",
383
+ )
384
+ parser.add_argument(
385
+ "--only_grounded_sample",
386
+ default=False,
387
+ action="store_true",
388
+ )
389
+ parser.add_argument(
390
+ "--expand",
391
+ default=False,
392
+ action="store_true",
393
+ )
394
+ parser.add_argument(
395
+ "--delete_contained",
396
+ default=False,
397
+ action="store_true",
398
+ )
399
+
400
+ parser.add_argument(
401
+ "--relation",
402
+ default=False,
403
+ action="store_true",
404
+ )
405
+ parser.add_argument(
406
+ "--attn_reg",
407
+ default="l1",
408
+ type=str,
409
+ required=False,
410
+ )
411
+ parser.add_argument(
412
+ "--enhance_data",
413
+ default=False,
414
+ action="store_true",
415
+ )
416
+ parser.add_argument(
417
+ "--no_visual",
418
+ default=False,
419
+ action="store_true",
420
+ )
421
+ parser.add_argument(
422
+ "--no_previsual",
423
+ default=False,
424
+ action="store_true",
425
+ )
426
+ parser.add_argument(
427
+ "--roi_align",
428
+ default=False,
429
+ action="store_true",
430
+ )
431
+ parser.add_argument(
432
+ "--roi_output_size",
433
+ default=4,
434
+ type=int,
435
+ required=False,
436
+ )
437
+ parser.add_argument(
438
+ "--apply_mask",
439
+ default=False,
440
+ action="store_true",
441
+ )
442
+ parser.add_argument(
443
+ "--longer_previsual",
444
+ default=False,
445
+ action="store_true",
446
+ )
447
+
448
+ args = parser.parse_args()
449
+ assert not args.use_media_placement_augmentation, "Do not enable use_media_placement_augmentation"
450
+ if args.no_previsual:
451
+ assert args.no_visual, "no_previsual MUST come with no_visual"
452
+ assert not args.enhance_data, "don't enable enhance_data"
453
+
454
+ if args.offline:
455
+ os.environ["WANDB_MODE"] = "offline"
456
+ os.environ["TRANSFORMERS_OFFLINE"] = "1"
457
+
458
+ args.local_rank, args.rank, args.world_size = world_info_from_env()
459
+ print(f"local_rank: {args.local_rank} rank: {args.rank} world_size: {args.world_size}")
460
+ device_id = init_distributed_device(args)
461
+
462
+ random_seed(args.seed)
463
+ model, image_processor, tokenizer, args.vis_embed_size = create_model_and_transforms(
464
+ args.vision_encoder_path,
465
+ args.vision_encoder_pretrained,
466
+ args.lm_path,
467
+ args.tokenizer_path if args.tokenizer_path else args.lm_path,
468
+ use_local_files=args.offline,
469
+ use_media_placement_augmentation=args.use_media_placement_augmentation,
470
+ checkpoint_activations=args.checkpoint_activations,
471
+ freeze_vision_encoder=args.freeze_vision_encoder,
472
+ location_token_num=args.location_token_num,
473
+ lora=args.lora,
474
+ lora_r=args.lora_r,
475
+ fix_ffn=args.fix_ffn,
476
+ add_visual_token=args.add_visual_token,
477
+ add_box=args.add_box,
478
+ add_pe=args.add_pe,
479
+ add_relation=args.relation,
480
+ use_format_v2=args.use_format_v2,
481
+ use_sam=args.use_sam,
482
+ enhance_data=args.enhance_data,
483
+ roi_align=args.roi_align,
484
+ roi_output_size=args.roi_output_size,
485
+ apply_mask=args.apply_mask,
486
+ )
487
+ if args.reset_llm:
488
+ llm_state_dict = model.lang_encoder.state_dict()
489
+ if args.rank == 0:
490
+ print(args)
491
+ print(image_processor)
492
+
493
+ random_seed(args.seed, args.rank)
494
+
495
+ if args.rank == 0 and args.report_to_wandb:
496
+ wandb.init(
497
+ project=args.wandb_project,
498
+ entity=args.wandb_entity,
499
+ name=args.run_name,
500
+ config=vars(args),
501
+ )
502
+
503
+ device_id = args.rank % torch.cuda.device_count()
504
+ if args.ddp:
505
+ print("use ddp mode")
506
+ model = model.to(device_id)
507
+ model = DDP(model)
508
+ else:
509
+ fpSixteen = MixedPrecision(
510
+ param_dtype=torch.float16,
511
+ # Gradient communication precision.
512
+ reduce_dtype=torch.float16,
513
+ # Buffer precision.
514
+ # buffer_dtype=torch.float16,
515
+ )
516
+ # from transformers.models.opt.modeling_opt import OPTDecoderLayer
517
+ from open_clip.transformer import ResidualAttentionBlock
518
+ from open_flamingo.src.flamingo_lm import FlamingoLayer
519
+ from transformers.models.opt.modeling_opt import OPTDecoderLayer, OPTAttention
520
+ from segment_anything.modeling.image_encoder import Block
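+ # these classes become the FSDP auto-wrap units: Flamingo LM layers, CLIP residual attention blocks, and SAM image-encoder blocks (OPTAttention is added below when --fix_ffn is set)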
521
+ transformer_layer_cls=[
522
+ FlamingoLayer,
523
+ ResidualAttentionBlock,
524
+ Block,
525
+ ]
526
+ if args.fix_ffn:
527
+ transformer_layer_cls.append(OPTAttention)
528
+ auto_wrap_policy = functools.partial(
529
+ transformer_auto_wrap_policy,
530
+ transformer_layer_cls=transformer_layer_cls,
531
+ )
532
+ if args.lora:
533
+ from torch.distributed.fsdp.wrap import _or_policy
534
+ lambda_policy = functools.partial(lambda_auto_wrap_policy, lambda_fn=lambda_policy_fn)
535
+ auto_wrap_policy = functools.partial(_or_policy, policies=[lambda_policy, auto_wrap_policy])
536
+ ignored_modules = [model.vision_encoder]
537
+ # ignored_modules = None
538
+ else:
539
+ ignored_modules = [model.detection_head]
540
+ # ignored_modules = None
541
+ if args.add_pe:
542
+ ignored_modules += [model.pos_enc]
543
+ # if args.use_format_v2:
544
+ # ignored_modules += [model.lang_encoder.visual_guided_lm_head]
545
+ model = FSDP(
546
+ model,
547
+ auto_wrap_policy=auto_wrap_policy,
548
+ mixed_precision=fpSixteen,
549
+ device_id=torch.cuda.current_device(),
550
+ ignored_modules=ignored_modules,
551
+ sharding_strategy=ShardingStrategy.SHARD_GRAD_OP,
552
+ )
553
+ model = model.to(device_id)
554
+
555
+
556
+ pile_dataset = None
557
+ if args.instruct:
558
+ laion_dataset = get_data(args, image_processor, tokenizer, "instruct")
559
+ else:
560
+ laion_dataset = get_data(args, image_processor, tokenizer, "ground_image_text")
561
+ if args.pile_shards is not None:
562
+ pile_dataset = get_data(args, image_processor, tokenizer, "pile")
563
+
564
+
565
+ optim_groups = get_grouped_params(model, args)
566
+ # optimizer = torch.optim.AdamW(optim_groups, lr=args.learning_rate)
567
+ if args.ddp:
568
+ optimizer = torch.optim.AdamW(optim_groups, lr=args.learning_rate)
569
+ # optimizer = ZeroRedundancyOptimizer(
570
+ # optim_groups,
571
+ # optimizer_class=torch.optim.AdamW,
572
+ # lr=args.learning_rate,
573
+ # parameters_as_bucket_view=True,
574
+ # )
575
+ else:
576
+ if args.optimizer == "adamw":
577
+ print("use adamw")
578
+ optimizer = torch.optim.AdamW(optim_groups, lr=args.learning_rate)
579
+ elif args.optimizer == "sgd":
580
+ print("use sgd...")
581
+ optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate)
582
+ else:
583
+ raise NotImplementedError
584
+
585
+ total_training_steps = args.num_steps
586
+
587
+ if args.rank == 0:
588
+ logging.info(f"Total training steps: {total_training_steps}")
589
+
590
+ if args.lr_scheduler == "linear":
591
+ lr_scheduler = get_linear_schedule_with_warmup(
592
+ optimizer,
593
+ num_warmup_steps=args.warmup_steps,
594
+ num_training_steps=total_training_steps,
595
+ )
596
+ elif args.lr_scheduler == "cosine":
597
+ lr_scheduler = get_cosine_schedule_with_warmup(
598
+ optimizer,
599
+ num_warmup_steps=args.warmup_steps,
600
+ num_training_steps=total_training_steps,
601
+ )
602
+ else:
603
+ lr_scheduler = get_constant_schedule_with_warmup(
604
+ optimizer, num_warmup_steps=args.warmup_steps
605
+ )
606
+ if args.ddp:
607
+ scaler = GradScaler()
608
+ else:
609
+ scaler = ShardedGradScaler()
610
+ total_laion_token = 0
611
+ total_pile_token = 0
612
+ total_laion_sample = 0
613
+ total_step = 0
614
+
615
+ # check if a checkpoint exists for this run
616
+ if os.path.exists(f"{args.run_name}"):
617
+ checkpoint_list = glob.glob(f"{args.run_name}/checkpoint_*.pt")
618
+ if len(checkpoint_list) == 0:
619
+ if args.rank == 0:
620
+ logging.info(f"Found no checkpoints for run {args.run_name}.")
621
+ else:
622
+ args.resume_from_checkpoint = sorted(
623
+ checkpoint_list, key=lambda x: int(x.split("_")[-1].split(".")[0])
624
+ )[-1]
625
+ if args.rank == 0:
626
+ logging.info(f"Found checkpoint {args.resume_from_checkpoint} for run {args.run_name}.")
627
+ args.restart = False
628
+ if args.rank == 0:
629
+ logging.info("do not restart because an existed checkpoint is found")
630
+ if args.resume_from_checkpoint is not None:
631
+ if args.rank == 0:
632
+ logging.info(f"Loading checkpoint from {args.resume_from_checkpoint}")
633
+ checkpoint = torch.load(args.resume_from_checkpoint, map_location="cpu")
634
+ torch.distributed.barrier()
635
+ if args.ddp:
636
+ model.module.load_state_dict(checkpoint["model_state_dict"], strict=False)
637
+ # sharded_osd = checkpoint['optimizer_state_dict']
638
+ else:
639
+ with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT):
640
+ if args.reset_llm:
641
+ for key in checkpoint["model_state_dict"]:
642
+ if key.startswith("lang_encoder"):
643
+ if args.rank == 0:
644
+ logging.info(f"reset {key}")
645
+ llm_key = key.replace("lang_encoder.", "")
646
+ checkpoint["model_state_dict"][key] = llm_state_dict[llm_key]
647
+ model_state_dict = model.state_dict()
648
+ for key in checkpoint["model_state_dict"].keys():
649
+ if model_state_dict[key].shape != checkpoint["model_state_dict"][key].shape:
650
+ if args.rank == 0:
651
+ logging.info(f'{key}: shape mismatch! {model_state_dict[key].shape} vs {checkpoint["model_state_dict"][key].shape}')
652
+ checkpoint["model_state_dict"][key] = model_state_dict[key].clone()
653
+ del model_state_dict
654
+ model.load_state_dict(checkpoint["model_state_dict"], False)
655
+ # sharded_osd = FSDP.shard_full_optim_state_dict(checkpoint['optimizer_state_dict'], model, optim_input=optim_groups)
656
+ if not args.restart:
657
+ # optimizer.load_state_dict(sharded_osd)
658
+ lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state_dict"])
659
+ # scaler.load_state_dict(checkpoint["scaler_state_dict"])
660
+ total_laion_token = checkpoint.get("total_laion_token", 0)
661
+ total_pile_token = checkpoint.get("total_pile_token", 0)
662
+ total_laion_sample = checkpoint.get("total_laion_sample", 0)
663
+ total_step = checkpoint.get("total_step", 0)
664
+ if args.rank == 0:
665
+ logging.info("load training statistics...")
666
+ else:
667
+ if args.rank == 0:
668
+ logging.info("restart training / finetuning. only load model weight...")
669
+ del checkpoint
670
+ if args.reset_llm:
671
+ del llm_state_dict
672
+ torch.cuda.empty_cache()
673
+ torch.distributed.barrier()
674
+
675
+ model.train()
676
+ if args.rank == 0:
677
+ if not os.path.exists(args.run_name):
678
+ os.makedirs(args.run_name)
679
+ writer = SummaryWriter(log_dir=os.path.join(args.run_name, "tblog"))
680
+ else:
681
+ writer = None
682
+
683
+ laion_dataset.set_epoch(total_step)
684
+ laion_loader = laion_dataset.dataloader
685
+ if pile_dataset is not None:
686
+ pile_dataset.set_epoch(total_step)
687
+ pile_loader = pile_dataset.dataloader
688
+ else:
689
+ pile_loader = FakeDataloader()
690
+ train_one_epoch(
691
+ args=args,
692
+ model=model,
693
+ tokenizer=tokenizer,
694
+ optimizer=optimizer,
695
+ lr_scheduler=lr_scheduler,
696
+ laion_loader=laion_loader,
697
+ pile_loader=pile_loader,
698
+ device_id=device_id,
699
+ writer=writer,
700
+ scaler=scaler,
701
+ optim_groups=optim_groups,
702
+ total_laion_token=total_laion_token,
703
+ total_pile_token=total_pile_token,
704
+ total_laion_sample=total_laion_sample,
705
+ total_step=total_step,
706
+ )
707
+
708
+ if __name__ == "__main__":
709
+ main()
multimodal/build/lib/open_flamingo/train/train_utils.py ADDED
@@ -0,0 +1,387 @@
1
+ import time
2
+ from contextlib import suppress
3
+ import numpy as np
4
+
5
+ import torch
6
+ from tqdm import tqdm
7
+ import datetime
8
+ import os
9
+ import gc
10
+ from torch.distributed.fsdp import (
11
+ FullyShardedDataParallel as FSDP,
12
+ MixedPrecision,
13
+ BackwardPrefetch,
14
+ ShardingStrategy,
15
+ FullStateDictConfig,
16
+ StateDictType,
17
+ )
18
+ from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
19
+ from torch.distributed.fsdp.wrap import (
20
+ transformer_auto_wrap_policy,
21
+ enable_wrap,
22
+ wrap,
23
+ )
24
+
25
+ from torch.utils.tensorboard import SummaryWriter
26
+ import logging
27
+ logging.basicConfig(
28
+ level=logging.INFO,
29
+ format='%(asctime)s %(message)s',
30
+ datefmt='%m/%d %I:%M:%S',
31
+ )
32
+
33
+ def get_cast_dtype(precision: str):
34
+ cast_dtype = None
35
+ if precision == "bf16":
36
+ cast_dtype = torch.bfloat16
37
+ elif precision == "fp16":
38
+ cast_dtype = torch.float16
39
+ return cast_dtype
40
+
41
+
42
+ def get_autocast(precision):
43
+ if precision == "amp_fp16":
44
+ return lambda: torch.cuda.amp.autocast(dtype=torch.float16)
45
+ elif precision == "amp_bfloat16" or precision == "amp_bf16":
46
+ # amp_bfloat16 is more stable than amp float16 for clip training
47
+ return lambda: torch.cuda.amp.autocast(dtype=torch.bfloat16)
48
+ else:
49
+ return suppress
50
+
51
+
52
+ def get_sync(model, flag):
53
+ if flag:
54
+ return suppress
55
+ else:
56
+ return lambda: model.no_sync()
57
+
58
+
59
+ def train_one_epoch(
60
+ args,
61
+ model,
62
+ laion_loader,
63
+ pile_loader,
64
+ tokenizer,
65
+ optimizer,
66
+ lr_scheduler,
67
+ device_id,
68
+ writer: SummaryWriter,
69
+ optim_groups,
70
+ scaler,
71
+ total_laion_token: int,
72
+ total_pile_token: int,
73
+ total_laion_sample: int,
74
+ total_step: int,
75
+ ):
76
+ world_size = torch.distributed.get_world_size()
77
+ autocast = get_autocast(args.precision)
78
+ cast_dtype = get_cast_dtype(args.precision)
79
+
80
+ media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
81
+ endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
82
+ visual_token_id = tokenizer("<|#visual#|>", add_special_tokens=False)["input_ids"][-1]
83
+ if args.add_box:
84
+ box_token_id = tokenizer("<|#box#|>", add_special_tokens=False)["input_ids"][-1]
85
+ endofobject_token_id = tokenizer("<|#endofobject#|>", add_special_tokens=False)["input_ids"][-1]
86
+ endofattr_token_id = tokenizer("<|#endofattr#|>", add_special_tokens=False)["input_ids"][-1]
87
+ if args.use_format_v2:
88
+ prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
89
+ previsual_token_id = tokenizer("<|#previsual#|>", add_special_tokens=False)["input_ids"][-1]
90
+ if args.rank == 0:
91
+ logging.info(f"train from: {total_step} step")
92
+ model.train()
93
+ # loop through dataloader
94
+ last_logging_step = total_step
95
+ last_save_step = total_step
96
+ for num_steps, (batch_laion, batch_pile) in tqdm(
97
+ enumerate(zip(laion_loader, pile_loader)),
98
+ disable=args.rank != 0 or "SLURM_PROCID" in os.environ,
99
+ total=args.num_steps * args.gradient_accumulation_steps,
100
+ initial=total_step * args.gradient_accumulation_steps,
101
+ ):
102
+ #### LAION FORWARD PASS ####
103
+ images = (
104
+ batch_laion[0]
105
+ .to(device_id, dtype=cast_dtype, non_blocking=True)
106
+ .unsqueeze(1)
107
+ .unsqueeze(1)
108
+ )
109
+ image_nums = batch_laion[1]
110
+ image_start_index_list = batch_laion[2]
111
+
112
+ # TODO: OPT model: input_ids does not start with </s> while input_ids2 does?
113
+ input_ids = batch_laion[3].to(device_id, non_blocking=True).long()
114
+ attention_mask = batch_laion[4].to(device_id, dtype=cast_dtype, non_blocking=True)
115
+ added_bbox_list = [x.to(device_id) for x in batch_laion[5]] # list object
116
+ total_laion_token += int(attention_mask.sum().long()) * world_size
117
+ total_laion_sample += sum(image_nums) * world_size
118
+
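+ # build LM labels from input_ids; positions set to -100 below (grounding/special tokens, padding, media markers, the first token) are ignored by the loss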
119
+ labels = input_ids.clone()
120
+ if args.add_box:
121
+ labels[input_ids == visual_token_id] = -100
122
+ labels[input_ids == box_token_id] = -100
123
+ labels[input_ids == endofattr_token_id] = -100
124
+ if args.use_format_v2:
125
+ labels[input_ids == previsual_token_id] = -100
126
+ labels[input_ids == prebox_token_id] = -100
127
+ labels[torch.roll(input_ids == prebox_token_id, 1)] = -100
128
+ labels[torch.roll(input_ids == box_token_id, 1)] = -100
129
+ labels[:, 0] = -100
130
+ labels[input_ids == tokenizer.pad_token_id] = -100
131
+ labels[input_ids == media_token_id] = -100
132
+ labels[input_ids == endofmedia_token_id] = -100
133
+ labels = labels.to(device_id)
134
+ current_laion_num = input_ids.shape[0]
135
+
136
+ #### PILE FORWARD PASS ####
137
+ if batch_pile is not None and batch_pile[0] is not None and batch_pile[1] is not None:
138
+ input_ids2 = batch_pile[0].to(device_id, non_blocking=True).long()
139
+ attention_mask2 = batch_pile[1].to(device_id, dtype=cast_dtype, non_blocking=True)
140
+ input_length = input_ids.shape[-1]
141
+
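+ # right-pad the text-only (pile) batch with pad tokens and zero attention so it matches the image-text sequence length and can be concatenated below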
142
+ input_ids2 = torch.cat([input_ids2, torch.ones((input_ids2.shape[0], input_length - input_ids2.shape[1]), device=input_ids2.device, dtype=input_ids2.dtype) * tokenizer.pad_token_id], dim=-1)
143
+ attention_mask2 = torch.cat([attention_mask2, torch.zeros((attention_mask2.shape[0], input_length - attention_mask2.shape[1]), device=attention_mask2.device, dtype=attention_mask2.dtype)], dim=-1)
144
+
145
+ labels2 = input_ids2.clone()
146
+ labels2[labels2 == tokenizer.pad_token_id] = -100
147
+ labels2[:, 0] = -100
148
+ labels2 = labels2.to(device_id)
149
+
150
+ if (num_steps != 0 and num_steps % args.pile_freq == 0) or args.pile_freq == 1:
151
+ image_nums = image_nums + [0] * len(input_ids2)
152
+ image_start_index_list = image_start_index_list + [[]] * len(input_ids2)
153
+ input_ids = torch.cat([input_ids, input_ids2], dim=0)
154
+ attention_mask = torch.cat([attention_mask, attention_mask2], dim=0)
155
+ labels = torch.cat([labels, labels2], dim=0)
156
+ total_pile_token += int(attention_mask2.sum().long()) * world_size
157
+ else:
158
+ del input_ids2
159
+ del attention_mask2
160
+ del labels2
161
+
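+ # instruction data: mask the prompt up to and including the " Answer" marker (and the token right after it) so loss is only computed on the answer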
162
+ if args.instruct:
163
+ answer_token_id = tokenizer(" Answer").input_ids[0]
164
+ answer_token_loc = (input_ids == answer_token_id).nonzero()
165
+ for batch_idx, idx in answer_token_loc:
166
+ labels[batch_idx][:idx+2] = -100
167
+
168
+ if args.relation and not args.instruct:
169
+ relations = batch_laion[6]
170
+ else:
171
+ relations = None
172
+ if len(added_bbox_list) == 0:
173
+ added_bbox_list = None
174
+ update_flag = (num_steps != 0 and num_steps % args.gradient_accumulation_steps == 0) or args.gradient_accumulation_steps == 1
175
+ # do_sync = get_sync(model, update_flag)
176
+ with autocast():
177
+ # modify:
178
+ # /gpfs/u/home/LMCG/LMCGljnn/scratch/miniconda3-ppc64le/envs/unified/lib/python3.9/site-packages/transformers/models/codegen/modeling_codegen.py
179
+ # /gpfs/u/home/LMCG/LMCGljnn/scratch/miniconda3-ppc64le/envs/unified/lib/python3.9/site-packages/transformers/models/opt/modeling_opt.py
180
+ # CrossEntropyLoss(reduction="none")
181
+ outputs = model(
182
+ vision_x=images,
183
+ lang_x=input_ids,
184
+ attention_mask=attention_mask,
185
+ labels=labels,
186
+ image_nums=image_nums,
187
+ image_start_index_list=image_start_index_list,
188
+ added_bbox_list=added_bbox_list,
189
+ add_box=args.add_box,
190
+ relations=relations,
191
+ )
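+ # outputs.loss is per-token here (the LM loss is expected to use reduction="none", see the note above); reshape to (batch, seq_len) and average the non-zero entries to get a per-sample loss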
192
+ loss_total = outputs.loss.reshape(labels.shape[0], -1)
193
+ loss_sample = loss_total.sum(-1) / (loss_total != 0).sum(-1)
194
+ loss_sample_for_laion = loss_sample[:current_laion_num]
195
+ nan_mask = torch.isnan(loss_sample_for_laion)
196
+ if nan_mask.sum() > 0:
197
+ logging.warning(f"caption NaN: {nan_mask}")
198
+ if nan_mask.sum() == len(loss_sample_for_laion) or not model.valid:
199
+ logging.info("WARNING: skip this caption loss due to some error")
200
+ loss_laion = torch.tensor(0.0).cuda()
201
+ else:
202
+ loss_laion = loss_sample_for_laion[~nan_mask].mean()
203
+ loss_caption = loss_laion
204
+ divided_loss_laion = loss_laion / args.gradient_accumulation_steps
205
+ if current_laion_num != loss_sample.shape[0]:
206
+ loss_pile = loss_sample[current_laion_num:].mean()
207
+ else:
208
+ loss_pile = torch.tensor(0.0).cuda()
209
+ divided_loss_pile = loss_pile / args.gradient_accumulation_steps
210
+
211
+ if "detection_losses" in outputs:
212
+ loss_det = outputs["detection_losses"]["loss"]
213
+ loss_iou = outputs["detection_losses"]["loss_iou"]
214
+ loss_obj = outputs["detection_losses"]["loss_obj"]
215
+ loss_cls = outputs["detection_losses"]["loss_cls"]
216
+ else:
217
+ loss_det = torch.tensor(0.0).cuda()
218
+ loss_iou = torch.tensor(0.0).cuda()
219
+ loss_obj = torch.tensor(0.0).cuda()
220
+ loss_cls = torch.tensor(0.0).cuda()
221
+
222
+ if "loss_dict" in outputs:
223
+ visual_loss_iou = outputs["loss_dict"][0]["loss_iou"]
224
+ previsual_loss_iou = outputs["loss_dict"][1]["loss_iou"]
225
+ visual_loss_obj = outputs["loss_dict"][0]["loss_obj"]
226
+ previsual_loss_obj = outputs["loss_dict"][1]["loss_obj"]
227
+ else:
228
+ visual_loss_iou = torch.tensor(0.0).cuda()
229
+ previsual_loss_iou = torch.tensor(0.0).cuda()
230
+ visual_loss_obj = torch.tensor(0.0).cuda()
231
+ previsual_loss_obj = torch.tensor(0.0).cuda()
232
+
233
+ divided_loss_det = loss_det / args.gradient_accumulation_steps
234
+ loss_rel = outputs.get("rel_loss", torch.tensor(0.0).cuda())
235
+ divided_loss_rel = loss_rel / args.gradient_accumulation_steps
236
+ loss = (
237
+ divided_loss_laion * args.loss_multiplier_laion +
238
+ divided_loss_pile * args.loss_multiplier_pile +
239
+ divided_loss_det * args.loss_multiplier_det +
240
+ divided_loss_rel * args.loss_multiplier_rel
241
+ )
242
+
243
+ scaler.scale(loss).backward()
244
+
245
+ # for logging only
246
+ loss = (
247
+ loss_laion * args.loss_multiplier_laion
248
+ + loss_pile * args.loss_multiplier_pile
249
+ + loss_det * args.loss_multiplier_det
250
+ + loss_rel * args.loss_multiplier_rel
251
+ ).detach()
252
+
253
+ # step optimizer and log
254
+ if update_flag:
255
+ #### MASK GRADIENTS FOR EMBEDDINGS ####
256
+ # Note (anas): Do not apply weight decay to embeddings as it will break this function.
257
+ # ! not an important point
258
+ # if args.ddp:
259
+ # def mask_embedding(m):
260
+ # if isinstance(m, torch.nn.Embedding) and m.weight.requires_grad:
261
+ # zero_mask = torch.zeros_like(m.weight.grad)
262
+ # zero_mask[media_token_id] = torch.ones_like(zero_mask[media_token_id])
263
+ # zero_mask[endofmedia_token_id] = torch.ones_like(zero_mask[endofmedia_token_id])
264
+ # m.weight.grad = m.weight.grad * zero_mask
265
+ # model.apply(mask_embedding)
266
+ total_step += 1
267
+ scaler.unscale_(optimizer)
268
+ if args.ddp:
269
+ torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
270
+ else:
271
+ model.clip_grad_norm_(1.0)
272
+ scaler.step(optimizer)
273
+ scaler.update()
274
+ lr_scheduler.step()
275
+ optimizer.zero_grad()
276
+ # https://github.com/facebookresearch/fairscale/issues/627
277
+ model.zero_grad(set_to_none=True)
278
+
279
+ if args.rank == 0 and total_step % args.logging_steps == 0 and total_step != last_logging_step:
280
+ last_logging_step = total_step
281
+ global_step = total_step
282
+ lr = optimizer.param_groups[0]["lr"]
283
+ writer.add_scalar("lr", lr, global_step)
284
+ writer.add_scalar("scale", scaler.get_scale(), global_step)
285
+ writer.add_scalar("loss_groundcaption", loss_laion.item(), global_step)
286
+ writer.add_scalar("loss_laion", loss_caption.item(), global_step)
287
+ writer.add_scalar("loss_pile", loss_pile.item(), global_step)
288
+ writer.add_scalar("loss", loss.item(), global_step)
289
+ writer.add_scalar("loss_det", loss_det.item(), global_step)
290
+ writer.add_scalar("loss_iou", loss_iou.item(), global_step)
291
+ writer.add_scalar("loss_obj", loss_obj.item(), global_step)
292
+ writer.add_scalar("loss_cls", loss_cls.item(), global_step)
293
+ if loss_rel.item() != 0:
294
+ writer.add_scalar("loss_rel", loss_rel.item(), global_step)
295
+ if args.use_format_v2:
296
+ writer.add_scalar("loss_iou_visual", visual_loss_iou.item(), global_step)
297
+ writer.add_scalar("loss_obj_visual", visual_loss_obj.item(), global_step)
298
+ writer.add_scalar("loss_iou_previsual", previsual_loss_iou.item(), global_step)
299
+ writer.add_scalar("loss_obj_previsual", previsual_loss_obj.item(), global_step)
300
+
301
+ global_sample_num = total_laion_sample
302
+ writer.add_scalar("loss_groundcaption_vs_sample_num", loss_laion.item(), global_sample_num)
303
+ writer.add_scalar("loss_laion_vs_sample_num", loss_caption.item(), global_sample_num)
304
+ writer.add_scalar("loss_pile_vs_sample_num", loss_pile.item(), global_sample_num)
305
+ writer.add_scalar("loss_vs_sample_num", loss.item(), global_sample_num)
306
+ writer.add_scalar("loss_det_vs_sample_num", loss_det.item(), global_sample_num)
307
+ writer.add_scalar("loss_iou_vs_sample_num", loss_iou.item(), global_sample_num)
308
+ writer.add_scalar("loss_obj_vs_sample_num", loss_obj.item(), global_sample_num)
309
+ if loss_rel.item() != 0:
310
+ writer.add_scalar("loss_rel_vs_sample_num", loss_rel.item(), global_sample_num)
311
+ writer.add_scalar("lr_vs_sample_num", optimizer.param_groups[0]["lr"], global_sample_num)
312
+
313
+ writer.add_scalar("loss_groundcaption_vs_token", loss_laion.item(), total_laion_token)
314
+ writer.add_scalar("loss_laion_vs_token", loss_caption.item(), total_laion_token)
315
+ writer.add_scalar("loss_pile_vs_token", loss_pile.item(), total_pile_token)
316
+ writer.add_scalar("loss_det_vs_token", loss_det.item(), total_laion_token)
317
+ writer.add_scalar("loss_iou_vs_token", loss_iou.item(), total_laion_token)
318
+ writer.add_scalar("loss_obj_vs_token", loss_obj.item(), total_laion_token)
319
+ writer.add_scalar("loss_cls_vs_token", loss_cls.item(), total_laion_token)
320
+ if loss_rel.item() != 0:
321
+ writer.add_scalar("loss_rel_vs_token", loss_rel.item(), total_laion_token)
322
+
323
+ total_token = total_laion_token + total_pile_token
324
+ writer.add_scalar("sample_num", global_sample_num, global_step)
325
+ writer.add_scalar("total_laion_token", total_laion_token, global_step)
326
+ writer.add_scalar("total_pile_token", total_pile_token, global_step)
327
+ writer.add_scalar("total_token", total_token, global_step)
328
+ logging.info(
329
+ f"[{global_step}][{total_laion_sample}][{total_token}]. total: {loss.item():.3f} // laion: {loss_caption.item():.3f} // pile: {loss_pile.item():.3f} // iou: {loss_iou.item():.4f} // obj: {loss_obj.item():.4f} // previsual_obj: {previsual_loss_obj.item():.4f} // visual_obj: {visual_loss_obj.item():.4f} // previsual_iou: {previsual_loss_iou.item():.4f} // visual_iou: {visual_loss_iou.item():.4f} // lr: {lr:.2e} // scale: {scaler.get_scale()}"
330
+ )
331
+
332
+ if total_step % args.save_interval == 0 and total_step != last_save_step:
333
+ last_save_step = total_step
334
+ torch.distributed.barrier()
335
+ if args.ddp:
336
+ cpu_state = model.state_dict()
337
+ # if args.rank == 0:
338
+ # optimizer_state = optimizer.state_dict()
339
+ else:
340
+ save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
341
+ with FSDP.state_dict_type(
342
+ model, StateDictType.FULL_STATE_DICT, save_policy
343
+ ):
344
+ cpu_state = model.state_dict()
345
+ torch.distributed.barrier()
346
+ # https://pytorch.org/docs/1.12/fsdp.html
347
+ # need to pass optim_groups as optim_input
348
+ # optimizer_state = FSDP.full_optim_state_dict(model, optimizer, optim_input=optim_groups)
349
+ if args.rank == 0:
350
+ checkpoint_dict = {
351
+ "model_state_dict": cpu_state,
352
+ # "optimizer_state_dict": optimizer_state,
353
+ "lr_scheduler_state_dict": lr_scheduler.state_dict(),
354
+ "scaler_state_dict": scaler.state_dict(),
355
+ "total_pile_token": total_pile_token,
356
+ "total_laion_token": total_laion_token,
357
+ "total_laion_sample": total_laion_sample,
358
+ "total_step": total_step,
359
+ }
360
+ logging.info(f"Saving checkpoint to {args.run_name}/checkpoint_{total_step}.pt")
361
+ torch.save(checkpoint_dict, f"{args.run_name}/checkpoint_{total_step}.pt")
362
+ del checkpoint_dict
363
+ if args.delete_previous_checkpoint and total_step-args.save_interval > 0 and (total_step-args.save_interval) % args.skip_delete_pattern != 0:
364
+ try:
365
+ os.remove(f"{args.run_name}/checkpoint_{total_step-args.save_interval}.pt")
366
+ except OSError:
367
+ pass
368
+ torch.distributed.barrier()
369
+
370
+
371
+ class AverageMeter(object):
372
+ """Computes and stores the average and current value"""
373
+
374
+ def __init__(self):
375
+ self.reset()
376
+
377
+ def reset(self):
378
+ self.val = 0
379
+ self.avg = 0
380
+ self.sum = 0
381
+ self.count = 0
382
+
383
+ def update(self, val, n=1):
384
+ self.val = val
385
+ self.sum += val * n
386
+ self.count += n
387
+ self.avg = self.sum / self.count
multimodal/open_flamingo.egg-info/PKG-INFO ADDED
@@ -0,0 +1,247 @@
1
+ Metadata-Version: 2.1
2
+ Name: open-flamingo
3
+ Version: 0.0.2
4
+ Summary: An open-source framework for training large multimodal models
5
+ License: MIT
6
+ Keywords: machine learning
7
+ Classifier: Development Status :: 4 - Beta
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+
15
+ # 🦩 OpenFlamingo
16
+
17
+ [![PyPI version](https://badge.fury.io/py/open_flamingo.svg)](https://badge.fury.io/py/open_flamingo)
18
+
19
+ [Blog post](https://laion.ai/blog/open-flamingo/) | Paper (coming soon)
20
+
21
+ Welcome to our open source version of DeepMind's [Flamingo](https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model) model! In this repository, we provide a PyTorch implementation for training and evaluating OpenFlamingo models. We also provide an initial [OpenFlamingo 9B model](https://huggingface.co/openflamingo/OpenFlamingo-9B) trained on a new Multimodal C4 dataset (coming soon). Please refer to our blog post for more details.
22
+
23
+ This repo is still under development, and we hope to release better performing and larger OpenFlamingo models soon. If you have any questions, please feel free to open an issue. We also welcome contributions!
24
+
25
+ # Table of Contents
26
+ - [Installation](#installation)
27
+ - [Approach](#approach)
28
+ * [Model architecture](#model-architecture)
29
+ - [Usage](#usage)
30
+ * [Initializing an OpenFlamingo model](#initializing-an-openflamingo-model)
31
+ * [Generating text](#generating-text)
32
+ - [Training](#training)
33
+ * [Dataset](#dataset)
34
+ - [Evaluation](#evaluation)
35
+ - [Future plans](#future-plans)
36
+ - [Team](#team)
37
+ - [Acknowledgments](#acknowledgments)
38
+ - [Citing](#citing)
39
+
40
+ # Installation
41
+
42
+ To install the package in an existing environment, run
43
+ ```
44
+ pip install open-flamingo
45
+ ```
46
+
47
+ or to create a conda environment for running OpenFlamingo, run
48
+ ```
49
+ conda env create -f environment.yml
50
+ ```
51
+
52
+ # Usage
53
+ We provide an initial [OpenFlamingo 9B model](https://huggingface.co/openflamingo/OpenFlamingo-9B) using a CLIP ViT-Large vision encoder and a LLaMA-7B language model. In general, we support any [CLIP vision encoder](https://huggingface.co/models?search=clip). For the language model, we support [LLaMA](https://huggingface.co/models?search=llama), [OPT](https://huggingface.co/models?search=opt), [GPT-Neo](https://huggingface.co/models?search=gpt-neo), [GPT-J](https://huggingface.co/models?search=gptj), and [Pythia](https://huggingface.co/models?search=pythia) models.
54
+
55
+ #### NOTE: To use LLaMA models, you will need to install the latest version of transformers via
56
+ ```
57
+ pip install git+https://github.com/huggingface/transformers
58
+ ```
59
+ Use this [script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py) for converting LLaMA weights to HuggingFace format.
60
+
61
+ ## Initializing an OpenFlamingo model
62
+ ``` python
63
+ from open_flamingo import create_model_and_transforms
64
+
65
+ model, image_processor, tokenizer = create_model_and_transforms(
66
+ clip_vision_encoder_path="ViT-L-14",
67
+ clip_vision_encoder_pretrained="openai",
68
+ lang_encoder_path="<path to llama weights in HuggingFace format>",
69
+ tokenizer_path="<path to llama tokenizer in HuggingFace format>",
70
+ cross_attn_every_n_layers=4
71
+ )
72
+
73
+ # grab model checkpoint from huggingface hub
74
+ from huggingface_hub import hf_hub_download
75
+ import torch
76
+
77
+ checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-9B", "checkpoint.pt")
78
+ model.load_state_dict(torch.load(checkpoint_path), strict=False)
79
+ ```
80
+
81
+ ## Generating text
82
+ Here is an example of generating text conditioned on interleaved images/text; in this case we do few-shot image captioning.
83
+
84
+ ``` python
85
+ from PIL import Image
86
+ import requests
87
+
88
+ """
89
+ Step 1: Load images
90
+ """
91
+ demo_image_one = Image.open(
92
+ requests.get(
93
+ "http://images.cocodataset.org/val2017/000000039769.jpg", stream=True
94
+ ).raw
95
+ )
96
+
97
+ demo_image_two = Image.open(
98
+ requests.get(
99
+ "http://images.cocodataset.org/test-stuff2017/000000028137.jpg",
100
+ stream=True
101
+ ).raw
102
+ )
103
+
104
+ query_image = Image.open(
105
+ requests.get(
106
+ "http://images.cocodataset.org/test-stuff2017/000000028352.jpg",
107
+ stream=True
108
+ ).raw
109
+ )
110
+
111
+
112
+ """
113
+ Step 2: Preprocessing images
114
+ Details: For OpenFlamingo, we expect the image to be a torch tensor of shape
115
+ batch_size x num_media x num_frames x channels x height x width.
116
+ In this case batch_size = 1, num_media = 3, num_frames = 1
117
+ (this will always be one except for video, which we don't support yet),
118
+ channels = 3, height = 224, width = 224.
119
+ """
120
+ vision_x = [image_processor(demo_image_one).unsqueeze(0), image_processor(demo_image_two).unsqueeze(0), image_processor(query_image).unsqueeze(0)]
121
+ vision_x = torch.cat(vision_x, dim=0)
122
+ vision_x = vision_x.unsqueeze(1).unsqueeze(0)
123
+
124
+ """
125
+ Step 3: Preprocessing text
126
+ Details: In the text we expect an <|#image#|> special token to indicate where an image is.
127
+ We also expect an <|endofchunk|> special token to indicate the end of the text
128
+ portion associated with an image.
129
+ """
130
+ tokenizer.padding_side = "left" # For generation padding tokens should be on the left
131
+ lang_x = tokenizer(
132
+ ["<|#image#|>An image of two cats.<|endofchunk|><|#image#|>An image of a bathroom sink.<|endofchunk|><|#image#|>An image of"],
133
+ return_tensors="pt",
134
+ )
135
+
136
+
137
+ """
138
+ Step 4: Generate text
139
+ """
140
+ generated_text = model.generate(
141
+ vision_x=vision_x,
142
+ lang_x=lang_x["input_ids"],
143
+ attention_mask=lang_x["attention_mask"],
144
+ max_new_tokens=20,
145
+ num_beams=3,
146
+ )
147
+
148
+ print("Generated text: ", tokenizer.decode(generated_text[0]))
149
+ ```
150
+
151
+ # Approach
152
+ OpenFlamingo is a multimodal language model that can be used for a variety of tasks. It is trained on a large multimodal dataset (e.g. Multimodal C4) and can be used to generate text conditioned on interleaved images/text. For example, OpenFlamingo can be used to generate a caption for an image, or to generate a question given an image and a text passage. The benefit of this approach is that we are able to rapidly adapt to new tasks using in-context learning.
153
+
154
+ ## Model architecture
155
+ OpenFlamingo seeks to fuse a pretrained vision encoder and a language model using cross-attention layers. The model architecture is shown below.
156
+
157
+ ![OpenFlamingo architecture](docs/flamingo.png)
158
+ Credit: [Flamingo](https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model)
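+ 
+ For intuition, each fusion step lets the text tokens cross-attend to the vision tokens through a gated residual branch; with a zero-initialized tanh gate (the Flamingo-style choice), the pretrained language model is unchanged at initialization. The sketch below is illustrative only and is not the repository's implementation (see `open_flamingo/src/helpers.py` and `open_flamingo/src/flamingo_lm.py` for the real modules):
+ 
+ ``` python
+ import torch
+ import torch.nn as nn
+ 
+ class GatedCrossAttentionSketch(nn.Module):
+     """Illustrative stand-in for a gated cross-attention fusion layer."""
+     def __init__(self, dim: int, num_heads: int = 8):
+         super().__init__()
+         self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
+         self.gate = nn.Parameter(torch.zeros(1))  # tanh(0) = 0: no vision influence at init
+ 
+     def forward(self, text_tokens, vision_tokens):
+         # text tokens query the vision tokens (keys/values)
+         attended, _ = self.attn(text_tokens, vision_tokens, vision_tokens)
+         return text_tokens + torch.tanh(self.gate) * attended
+ ```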
159
+
160
+ # Training
161
+ To train a model, modify the following example command, which uses OPT 1.3B as an example LM:
162
+ ```
163
+ torchrun --nnodes=1 --nproc_per_node=4 train.py \
164
+ --run_name flamingo3B \
165
+ --lm_path facebook/opt-1.3b \
166
+ --tokenizer_path facebook/opt-1.3b \
167
+ --dataset_resampled \
168
+ --laion_shards "/path/to/shards/shard-{0000..0999}.tar" \
169
+ --mmc4_shards "/path/to/shards/shard-{0000..0999}.tar" \
170
+ --batch_size_mmc4 4 \
171
+ --batch_size_laion 8 \
172
+ --train_num_samples_mmc4 125000 \
173
+ --train_num_samples_laion 250000 \
174
+ --loss_multiplier_laion 0.2 \
175
+ --workers=6 \
176
+ --num_epochs 250 \
177
+ --lr_scheduler constant \
178
+ --warmup_steps 5000 \
179
+ --use_media_placement_augmentation \
180
+ --mmc4_textsim_threshold 30
181
+ ```
182
+
183
+ ## Dataset
184
+ We expect all our training datasets to be [WebDataset](https://github.com/webdataset/webdataset) shards.
185
+ We train our models on the [LAION 2B](https://huggingface.co/datasets/laion/laion2B-en) and Multimodal C4 (coming soon) datasets. The LAION 2B dataset is already in WebDataset format when downloaded with the [img2dataset tool](https://github.com/rom1504/img2dataset), and Multimodal C4 also comes packaged in WebDataset format.
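+ 
+ As a quick, illustrative sketch (this is not the project's training loader, which lives in `open_flamingo/train/data2.py`), an img2dataset-style shard can be iterated with the `webdataset` library; the shard path and the `jpg`/`txt` keys below are assumptions based on the img2dataset defaults:
+ 
+ ``` python
+ import webdataset as wds
+ 
+ shards = "/path/to/shards/shard-{0000..0999}.tar"  # hypothetical path
+ dataset = (
+     wds.WebDataset(shards)
+     .decode("pil")            # decode images to PIL.Image
+     .to_tuple("jpg", "txt")   # yield (image, caption) pairs
+ )
+ 
+ for image, caption in dataset:
+     print(image.size, caption[:60])
+     break
+ ```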
186
+
187
+
188
+ # Evaluation
189
+ We currently support running evaluations on [COCO](https://cocodataset.org/#home), [VQAv2](https://visualqa.org/index.html), [OKVQA](https://okvqa.allenai.org), [Flickr30k](https://www.kaggle.com/datasets/hsankesara/flickr-image-dataset), and [ImageNet](https://image-net.org/index.php). Note that these evaluations are currently run in validation mode (as specified in the Flamingo paper). We will add support for running evaluations in test mode in the future.
190
+
191
+ Before evaluating the model, you will need to install the coco evaluation package by running the following command:
192
+ ```
193
+ pip install pycocoevalcap
194
+ ```
195
+
196
+ To run evaluations on OKVQA you will first need to run the following Python snippet:
197
+ ```
198
+ import nltk
199
+ nltk.download('wordnet')
200
+ ```
201
+
202
+ To evaluate the model, run the script at `open_flamingo/scripts/run_eval.sh`
203
+
204
+ # Future plans
205
+ - [ ] Add support for video input
206
+ - [ ] Release better performing and larger OpenFlamingo models
207
+ - [ ] Expand our evaluation suite
208
+ - [ ] Add support for FSDP training
209
+
210
+ # Team
211
+
212
+ OpenFlamingo is developed by:
213
+
214
+ [Anas Awadalla](https://anas-awadalla.streamlit.app/), [Irena Gao](https://i-gao.github.io/), [Joshua Gardner](https://homes.cs.washington.edu/~jpgard/), [Jack Hessel](https://jmhessel.com/), [Yusuf Hanafy](https://www.linkedin.com/in/yusufhanafy/), [Wanrong Zhu](https://wanrong-zhu.com/), [Kalyani Marathe](https://sites.google.com/uw.edu/kalyanimarathe/home?authuser=0), [Yonatan Bitton](https://yonatanbitton.github.io/), [Samir Gadre](https://sagadre.github.io/), [Jenia Jitsev](https://scholar.google.de/citations?user=p1FuAMkAAAAJ&hl=en), [Simon Kornblith](https://simonster.com/), [Pang Wei Koh](https://koh.pw/), [Gabriel Ilharco](https://gabrielilharco.com/), [Mitchell Wortsman](https://mitchellnw.github.io/), [Ludwig Schmidt](https://people.csail.mit.edu/ludwigs/).
215
+
216
+ The team is primarily from the University of Washington, Stanford, AI2, UCSB, and Google.
217
+
218
+ # Acknowledgments
219
+ This code is based on Lucidrains' [flamingo implementation](https://github.com/lucidrains/flamingo-pytorch) and David Hansmair's [flamingo-mini repo](https://github.com/dhansmair/flamingo-mini). Thank you for making your code public! We also thank the [OpenCLIP](https://github.com/mlfoundations/open_clip) team as we use their data loading code and take inspiration from their library design.
220
+
221
+ We would also like to thank [Jean-Baptiste Alayrac](https://www.jbalayrac.com) and [Antoine Miech](https://antoine77340.github.io) for their advice, [Rohan Taori](https://www.rohantaori.com/), [Nicholas Schiefer](https://nicholasschiefer.com/), [Deep Ganguli](https://hai.stanford.edu/people/deep-ganguli), [Thomas Liao](https://thomasliao.com/), [Tatsunori Hashimoto](https://thashim.github.io/), and [Nicholas Carlini](https://nicholas.carlini.com/) for their help with assessing the safety risks of our release, and to [Stability AI](https://stability.ai) for providing us with compute resources to train these models.
222
+
223
+ # Citing
224
+ If you found this repository useful, please consider citing:
225
+
226
+ ```
227
+ @software{anas_awadalla_2023_7733589,
228
+ author = {Awadalla, Anas and Gao, Irena and Gardner, Joshua and Hessel, Jack and Hanafy, Yusuf and Zhu, Wanrong and Marathe, Kalyani and Bitton, Yonatan and Gadre, Samir and Jitsev, Jenia and Kornblith, Simon and Koh, Pang Wei and Ilharco, Gabriel and Wortsman, Mitchell and Schmidt, Ludwig},
229
+ title = {OpenFlamingo},
230
+ month = mar,
231
+ year = 2023,
232
+ publisher = {Zenodo},
233
+ version = {v0.1.1},
234
+ doi = {10.5281/zenodo.7733589},
235
+ url = {https://doi.org/10.5281/zenodo.7733589}
236
+ }
237
+ ```
238
+
239
+ ```
240
+ @article{Alayrac2022FlamingoAV,
241
+ title={Flamingo: a Visual Language Model for Few-Shot Learning},
242
+ author={Jean-Baptiste Alayrac and Jeff Donahue and Pauline Luc and Antoine Miech and Iain Barr and Yana Hasson and Karel Lenc and Arthur Mensch and Katie Millican and Malcolm Reynolds and Roman Ring and Eliza Rutherford and Serkan Cabi and Tengda Han and Zhitao Gong and Sina Samangooei and Marianne Monteiro and Jacob Menick and Sebastian Borgeaud and Andy Brock and Aida Nematzadeh and Sahand Sharifzadeh and Mikolaj Binkowski and Ricardo Barreira and Oriol Vinyals and Andrew Zisserman and Karen Simonyan},
243
+ journal={ArXiv},
244
+ year={2022},
245
+ volume={abs/2204.14198}
246
+ }
247
+ ```
multimodal/open_flamingo.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,53 @@
1
+ LICENSE
2
+ README.md
3
+ setup.py
4
+ open_flamingo/__init__.py
5
+ open_flamingo.egg-info/PKG-INFO
6
+ open_flamingo.egg-info/SOURCES.txt
7
+ open_flamingo.egg-info/dependency_links.txt
8
+ open_flamingo.egg-info/requires.txt
9
+ open_flamingo.egg-info/top_level.txt
10
+ open_flamingo/chat/__init__.py
11
+ open_flamingo/chat/conversation.py
12
+ open_flamingo/eval/__init__.py
13
+ open_flamingo/eval/classification.py
14
+ open_flamingo/eval/coco_metric.py
15
+ open_flamingo/eval/eval_datasets.py
16
+ open_flamingo/eval/evaluate.py
17
+ open_flamingo/eval/evaluate_debug.py
18
+ open_flamingo/eval/evaluate_find_showcase.py
19
+ open_flamingo/eval/evaluate_temp.py
20
+ open_flamingo/eval/imagenet_utils.py
21
+ open_flamingo/eval/ok_vqa_utils.py
22
+ open_flamingo/eval/vqa_metric.py
23
+ open_flamingo/eval/dataset_zoo/__init__.py
24
+ open_flamingo/eval/dataset_zoo/aro_datasets.py
25
+ open_flamingo/eval/dataset_zoo/constants.py
26
+ open_flamingo/eval/dataset_zoo/perturbations.py
27
+ open_flamingo/eval/dataset_zoo/retrieval.py
28
+ open_flamingo/eval/dataset_zoo/utils.py
29
+ open_flamingo/eval/task/__init__.py
30
+ open_flamingo/eval/task/caption.py
31
+ open_flamingo/eval/task/caption_chat.py
32
+ open_flamingo/eval/task/cola.py
33
+ open_flamingo/eval/task/crepe.py
34
+ open_flamingo/eval/task/gqa.py
35
+ open_flamingo/eval/task/mmbench.py
36
+ open_flamingo/eval/task/reg.py
37
+ open_flamingo/eval/task/utils.py
38
+ open_flamingo/eval/task/vl_checklist.py
39
+ open_flamingo/src/__init__.py
40
+ open_flamingo/src/attention.py
41
+ open_flamingo/src/factory.py
42
+ open_flamingo/src/flamingo.py
43
+ open_flamingo/src/flamingo_lm.py
44
+ open_flamingo/src/gcn.py
45
+ open_flamingo/src/helpers.py
46
+ open_flamingo/src/utils.py
47
+ open_flamingo/train/__init__.py
48
+ open_flamingo/train/data2.py
49
+ open_flamingo/train/distributed.py
50
+ open_flamingo/train/instruction_template.py
51
+ open_flamingo/train/train.py
52
+ open_flamingo/train/train_utils.py
53
+ tests/test_flamingo_model.py
multimodal/open_flamingo.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
1
+
multimodal/open_flamingo.egg-info/requires.txt ADDED
@@ -0,0 +1,17 @@
1
+ einops
2
+ einops-exts
3
+ transformers==4.31.0
4
+ torch==1.12.1
5
+ torchvision==0.13.1
6
+ pillow==9.3.0
7
+ more-itertools
8
+ datasets==2.9.0
9
+ braceexpand==0.1.7
10
+ webdataset
11
+ wandb==0.13.10
12
+ nltk
13
+ scipy
14
+ inflection
15
+ sentencepiece
16
+ open_clip_torch==2.20.0
17
+ opencv-python==4.7.0.68
multimodal/open_flamingo.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
1
+ open_flamingo