Spaces:
Runtime error
Runtime error
DongfuJiang
committed on
Commit
•
98cf109
1
Parent(s):
a5a9bfc
update
Browse files
- app_regression.py +1 -1
- models/idefics2/modeling_idefics2.py +27 -9
app_regression.py
CHANGED
@@ -17,7 +17,7 @@ from typing import List
|
|
17 |
processor = AutoProcessor.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-20k-mantis-2epoch_4096_regression")
|
18 |
model = Idefics2ForSequenceClassification.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-20k-mantis-2epoch_4096_regression", torch_dtype=torch.bfloat16).eval()
|
19 |
|
20 |
-
MAX_NUM_FRAMES =
|
21 |
conv_template = conv_templates["idefics_2"]
|
22 |
|
23 |
with open("./examples/all_subsets.json", 'r') as f:
|
|
|
17 |
processor = AutoProcessor.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-20k-mantis-2epoch_4096_regression")
|
18 |
model = Idefics2ForSequenceClassification.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-20k-mantis-2epoch_4096_regression", torch_dtype=torch.bfloat16).eval()
|
19 |
|
20 |
+
MAX_NUM_FRAMES = 24
|
21 |
conv_template = conv_templates["idefics_2"]
|
22 |
|
23 |
with open("./examples/all_subsets.json", 'r') as f:
|
models/idefics2/modeling_idefics2.py
CHANGED
@@ -1658,15 +1658,33 @@ class Idefics2Model(Idefics2PreTrainedModel):
|
|
1658 |
patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
|
1659 |
|
1660 |
# Get sequence from the vision encoder
|
1661 |
-
|
1662 |
-
|
1663 |
-
|
1664 |
-
|
1665 |
-
|
1666 |
-
|
1667 |
-
|
1668 |
-
|
1669 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1670 |
|
1671 |
elif image_hidden_states is not None:
|
1672 |
image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=input_ids.device)
|
|
|
1658 |
patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
|
1659 |
|
1660 |
# Get sequence from the vision encoder
|
1661 |
+
pixel_batch_size = 4
|
1662 |
+
all_image_hidden_states = []
|
1663 |
+
for i in range(0, pixel_values.size(0), pixel_batch_size):
|
1664 |
+
batch_pixel_values = pixel_values[i : i + pixel_batch_size]
|
1665 |
+
batch_patch_attention_mask = patch_attention_mask[i : i + pixel_batch_size]
|
1666 |
+
|
1667 |
+
batch_image_hidden_states = self.vision_model(
|
1668 |
+
pixel_values=batch_pixel_values,
|
1669 |
+
patch_attention_mask=batch_patch_attention_mask,
|
1670 |
+
).last_hidden_state
|
1671 |
+
|
1672 |
+
batch_image_hidden_states = self.connector(
|
1673 |
+
batch_image_hidden_states, attention_mask=batch_patch_attention_mask.view(batch_pixel_values.size(0), -1)
|
1674 |
+
)
|
1675 |
+
all_image_hidden_states.append(batch_image_hidden_states)
|
1676 |
+
|
1677 |
+
image_hidden_states = torch.cat(all_image_hidden_states, dim=0)
|
1678 |
+
|
1679 |
+
# image_hidden_states = self.vision_model(
|
1680 |
+
# pixel_values=pixel_values,
|
1681 |
+
# patch_attention_mask=patch_attention_mask,
|
1682 |
+
# ).last_hidden_state
|
1683 |
+
|
1684 |
+
# # Modality projection & resampling
|
1685 |
+
# image_hidden_states = self.connector(
|
1686 |
+
# image_hidden_states, attention_mask=patch_attention_mask.view(pixel_values.size(0), -1)
|
1687 |
+
# )
|
1688 |
|
1689 |
elif image_hidden_states is not None:
|
1690 |
image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=input_ids.device)
|