Spaces:
Runtime error
Runtime error
DongfuJiang
committed on
Commit
•
98cf109
1
Parent(s):
a5a9bfc
update
Browse files
- app_regression.py +1 -1
- models/idefics2/modeling_idefics2.py +27 -9
app_regression.py
CHANGED
@@ -17,7 +17,7 @@ from typing import List
|
|
17 |
processor = AutoProcessor.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-20k-mantis-2epoch_4096_regression")
|
18 |
model = Idefics2ForSequenceClassification.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-20k-mantis-2epoch_4096_regression", torch_dtype=torch.bfloat16).eval()
|
19 |
|
20 |
-
MAX_NUM_FRAMES =
|
21 |
conv_template = conv_templates["idefics_2"]
|
22 |
|
23 |
with open("./examples/all_subsets.json", 'r') as f:
|
|
|
17 |
processor = AutoProcessor.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-20k-mantis-2epoch_4096_regression")
|
18 |
model = Idefics2ForSequenceClassification.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-20k-mantis-2epoch_4096_regression", torch_dtype=torch.bfloat16).eval()
|
19 |
|
20 |
+
MAX_NUM_FRAMES = 24
|
21 |
conv_template = conv_templates["idefics_2"]
|
22 |
|
23 |
with open("./examples/all_subsets.json", 'r') as f:
|
models/idefics2/modeling_idefics2.py
CHANGED
@@ -1658,15 +1658,33 @@ class Idefics2Model(Idefics2PreTrainedModel):
|
|
1658 |
patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
|
1659 |
|
1660 |
# Get sequence from the vision encoder
|
1661 |
-
|
1662 |
-
|
1663 |
-
|
1664 |
-
|
1665 |
-
|
1666 |
-
|
1667 |
-
|
1668 |
-
|
1669 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1670 |
|
1671 |
elif image_hidden_states is not None:
|
1672 |
image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=input_ids.device)
|
|
|
1658 |
patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
|
1659 |
|
1660 |
# Get sequence from the vision encoder
|
1661 |
+
pixel_batch_size = 4
|
1662 |
+
all_image_hidden_states = []
|
1663 |
+
for i in range(0, pixel_values.size(0), pixel_batch_size):
|
1664 |
+
batch_pixel_values = pixel_values[i : i + pixel_batch_size]
|
1665 |
+
batch_patch_attention_mask = patch_attention_mask[i : i + pixel_batch_size]
|
1666 |
+
|
1667 |
+
batch_image_hidden_states = self.vision_model(
|
1668 |
+
pixel_values=batch_pixel_values,
|
1669 |
+
patch_attention_mask=batch_patch_attention_mask,
|
1670 |
+
).last_hidden_state
|
1671 |
+
|
1672 |
+
batch_image_hidden_states = self.connector(
|
1673 |
+
batch_image_hidden_states, attention_mask=batch_patch_attention_mask.view(batch_pixel_values.size(0), -1)
|
1674 |
+
)
|
1675 |
+
all_image_hidden_states.append(batch_image_hidden_states)
|
1676 |
+
|
1677 |
+
image_hidden_states = torch.cat(all_image_hidden_states, dim=0)
|
1678 |
+
|
1679 |
+
# image_hidden_states = self.vision_model(
|
1680 |
+
# pixel_values=pixel_values,
|
1681 |
+
# patch_attention_mask=patch_attention_mask,
|
1682 |
+
# ).last_hidden_state
|
1683 |
+
|
1684 |
+
# # Modality projection & resampling
|
1685 |
+
# image_hidden_states = self.connector(
|
1686 |
+
# image_hidden_states, attention_mask=patch_attention_mask.view(pixel_values.size(0), -1)
|
1687 |
+
# )
|
1688 |
|
1689 |
elif image_hidden_states is not None:
|
1690 |
image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=input_ids.device)
|