import gradio as gr
import torch
from pytorchvideo.data import make_clip_sampler
from pytorchvideo.data.clip_sampling import ClipInfoList
from pytorchvideo.data.encoded_video_pyav import EncodedVideoPyAV
from pytorchvideo.data.video import VideoPathHandler
from pytorchvideo.transforms import (
    Normalize,
    RandomShortSideScale,
    UniformTemporalSubsample,
)
from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
)
from transformers import pipeline
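
# Model checkpoint and inference settings.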
MODEL_CKPT = "omermazig/videomae-finetuned-nba-5-class-4-batch-8000-vid-multiclass"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
CLIPS_FROM_SINGLE_VIDEO = 5
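
# Load the fine-tuned model and its image processor through the video-classification pipeline.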
pipe = pipeline("video-classification", model=MODEL_CKPT)
trained_model = pipe.model.to(DEVICE)
image_processor = pipe.image_processor
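
# Normalization statistics and input resolution expected by the model.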
mean = image_processor.image_mean
std = image_processor.image_std
if "shortest_edge" in image_processor.size:
height = width = image_processor.size["shortest_edge"]
else:
height = image_processor.size["height"]
width = image_processor.size["width"]
resize_to = (height, width)
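
# Each sampled clip covers `num_frames_to_sample` frames taken every `sample_rate` frames of a 30 fps video.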
num_frames_to_sample = trained_model.config.num_frames
sample_rate = 4
fps = 30
clip_duration = num_frames_to_sample * sample_rate / fps
# Transformations applied to each sampled clip, matching the validation/test pipeline used during training.
inference_transform = Compose(
    [
        UniformTemporalSubsample(num_frames_to_sample),
        Lambda(lambda x: x / 255.0),
        Normalize(mean, std),
        RandomShortSideScale(min_size=256, max_size=320),
        RandomCrop(resize_to),
    ]
)
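
# Class names; assumes `label2id` keys are ordered by label id, as produced by the training script.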
labels = list(trained_model.config.label2id.keys())


def parse_video_to_clips(video_file):
    """Sample CLIPS_FROM_SINGLE_VIDEO random clips from the input video and stack them into one batch."""
    video_path_handler = VideoPathHandler()
    video: EncodedVideoPyAV = video_path_handler.video_from_path(video_file)
    clip_sampler = make_clip_sampler("random_multi", clip_duration, CLIPS_FROM_SINGLE_VIDEO)
    # noinspection PyTypeChecker
    clip_info: ClipInfoList = clip_sampler(0, video.duration, {})

    video_clips_list = []
    for clip_start, clip_end in zip(clip_info.clip_start_sec, clip_info.clip_end_sec):
        video_clip = video.get_clip(clip_start, clip_end)["video"]
        video_clips_list.append(inference_transform(video_clip))
    # Permute each clip from (channels, frames, height, width) to (frames, channels, height, width).
    videos_tensor = torch.stack([single_clip.permute(1, 0, 2, 3) for single_clip in video_clips_list])
    return videos_tensor


def infer(video_file):
    videos_tensor = parse_video_to_clips(video_file)
    inputs = {"pixel_values": videos_tensor.to(DEVICE)}

    # Forward pass over all sampled clips at once.
    with torch.no_grad():
        outputs = trained_model(**inputs)
    multiple_logits = outputs.logits

    # Aggregate the per-clip logits and convert them to class confidences.
    logits = multiple_logits.sum(dim=0)
    softmax_scores = torch.nn.functional.softmax(logits, dim=-1)
    confidences = {labels[i]: float(softmax_scores[i]) for i in range(len(labels))}
    return confidences


gr.Interface(
    fn=infer,
    inputs=gr.Video(type="file"),
    outputs=gr.Label(num_top_classes=3),
    examples=[
        ["examples/babycrawling.mp4"],
        ["examples/baseball.mp4"],
        ["examples/balancebeam.mp4"],
    ],
    title="VideoMAE fine-tuned on a subset of UCF-101",
    description=(
        "Gradio demo for VideoMAE video classification. To use it, simply upload your video or click one of the"
        " examples to load it. Read more at the links below."
    ),
    article=(
        "<div style='text-align: center;'><a href='https://huggingface.co/docs/transformers/model_doc/videomae' target='_blank'>VideoMAE</a>"
        " <center><a href='https://huggingface.co/sayakpaul/videomae-base-finetuned-kinetics-finetuned-ucf101-subset' target='_blank'>Fine-tuned Model</a></center></div>"
    ),
    allow_flagging=False,
    allow_screenshot=False,
).launch()