VideoMAE finetuned for shot scale and movement classification
The videomae-base-finetuned-kinetics model, fine-tuned to classify:
- shot scale into five classes: ECS (Extreme close-up shot), CS (close-up shot), MS (medium shot), FS (full shot), LS (long shot)
- shot movement into four classes: Static, Motion, Pull, Push
The MovieNet dataset is used to fine-tune the model for 5 epochs. The file v1_split_trailer.json provides the training, validation, and test data splits.
Evaluation
Model achieves:
- shot scale accuracy of 88.32% and macro-f1 of 88.57%
- shot movement accuracy of 91.45% and macro-f1 of 80.8%
Class-wise accuracies:
- shot scale: ECS - 90.92%, CS - 83.2%, MS - 85.0%, FS - 89.71%, LS - 94.55%
- shot movement: Static - 94.6%, Motion - 87.7%, Pull - 57.5%, Push - 66.82%
Model Definition
import torch.nn as nn
import torch.nn.functional as F
from transformers import VideoMAEImageProcessor, VideoMAEModel, VideoMAEConfig, PreTrainedModel
class CustomVideoMAEConfig(VideoMAEConfig):
    """VideoMAE configuration that also carries label maps for two tasks.

    Adds four attributes on top of ``VideoMAEConfig``: label->id and
    id->label mappings for the shot-scale task and for the shot-movement task.
    """

    def __init__(self, scale_label2id=None, scale_id2label=None, movement_label2id=None, movement_id2label=None, **kwargs):
        """Store the label maps, defaulting each missing one to an empty dict.

        Args:
            scale_label2id: Mapping from shot-scale label to class id.
            scale_id2label: Mapping from shot-scale class id to label.
            movement_label2id: Mapping from shot-movement label to class id.
            movement_id2label: Mapping from shot-movement class id to label.
            **kwargs: Forwarded unchanged to ``VideoMAEConfig.__init__``.
        """
        super().__init__(**kwargs)
        label_maps = {
            "scale_label2id": scale_label2id,
            "scale_id2label": scale_id2label,
            "movement_label2id": movement_label2id,
            "movement_id2label": movement_id2label,
        }
        for attr_name, mapping in label_maps.items():
            # Only ``None`` is replaced; an explicitly passed mapping is kept as-is.
            setattr(self, attr_name, {} if mapping is None else mapping)
class CustomModel(PreTrainedModel):
    """VideoMAE backbone with two linear classification heads.

    One head produces shot-scale logits and the other shot-movement logits;
    both read the same pooled backbone output.
    """

    config_class = CustomVideoMAEConfig

    def __init__(self, config, model_name, scale_num_classes, movement_num_classes):
        """Load the backbone and create the two heads.

        Args:
            config: Model configuration; ``hidden_size`` sizes the heads and
                ``use_mean_pooling`` selects the pooling strategy.
            model_name: Checkpoint identifier or path handed to
                ``VideoMAEModel.from_pretrained``.
            scale_num_classes: Number of outputs of the shot-scale head.
            movement_num_classes: Number of outputs of the shot-movement head.
        """
        super().__init__(config)
        self.vmae = VideoMAEModel.from_pretrained(model_name, ignore_mismatched_sizes=True)
        # LayerNorm is only needed for the mean-pooling path used in forward().
        self.fc_norm = nn.LayerNorm(config.hidden_size) if config.use_mean_pooling else None
        self.scale_cf = nn.Linear(config.hidden_size, scale_num_classes)
        self.movement_cf = nn.Linear(config.hidden_size, movement_num_classes)

    def forward(self, pixel_values, scale_labels=None, movement_labels=None):
        """Compute logits for both tasks and, when labels are given, the loss.

        Args:
            pixel_values: Input forwarded unchanged to the VideoMAE backbone.
            scale_labels: Optional targets for the shot-scale head.
            movement_labels: Optional targets for the shot-movement head.

        Returns:
            A dict with ``"scale_logits"`` and ``"movement_logits"``. A
            ``"loss"`` entry is added when at least one label tensor is
            supplied: the sum of the cross-entropy losses of the heads that
            have labels.
        """
        sequence_output = self.vmae(pixel_values)[0]
        if self.fc_norm is not None:
            # Average over dim 1 (presumably the token axis), then normalize.
            pooled = self.fc_norm(sequence_output.mean(1))
        else:
            # Without mean pooling, use the entry at index 0 along dim 1.
            pooled = sequence_output[:, 0]

        scale_logits = self.scale_cf(pooled)
        movement_logits = self.movement_cf(pooled)
        outputs = {"scale_logits": scale_logits, "movement_logits": movement_logits}

        # Previously the loss was dropped unless BOTH label tensors were
        # passed; now each head with labels contributes its own term.
        task_losses = [
            F.cross_entropy(logits, labels)
            for logits, labels in (
                (scale_logits, scale_labels),
                (movement_logits, movement_labels),
            )
            if labels is not None
        ]
        if task_losses:
            outputs = {"loss": sum(task_losses), **outputs}
        return outputs
# Label vocabularies for the two heads, with inverse maps derived from them.
scale_lab2id = {"ECS": 0, "CS": 1, "MS": 2, "FS": 3, "LS": 4}
scale_id2lab = {v: k for k, v in scale_lab2id.items()}
movement_lab2id = {"Static": 0, "Motion": 1, "Pull": 2, "Push": 3}
movement_id2lab = {v: k for k, v in movement_lab2id.items()}

# Backbone checkpoint. The original snippet used `model_name` without
# defining it; the model card names videomae-base-finetuned-kinetics.
# NOTE(review): the "MCG-NJU/" hub prefix is assumed - confirm.
model_name = "MCG-NJU/videomae-base-finetuned-kinetics"

config = CustomVideoMAEConfig(scale_lab2id, scale_id2lab, movement_lab2id, movement_id2lab)
# Head sizes follow the label maps so they cannot drift out of sync.
model = CustomModel(config, model_name, len(scale_lab2id), len(movement_lab2id))
- Downloads last month
- 6,529
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social
visibility and check back later, or deploy to Inference Endpoints (dedicated)
instead.