omermazig committed on
Commit
2ac5dcc
·
1 Parent(s): 04b5589

Change app code to use my own:

Browse files

* model
* transformations
* inference method (multiple clips)

Files changed (1) hide show
  1. app.py +65 -98
app.py CHANGED
@@ -1,119 +1,86 @@
1
- import cv2
2
  import gradio as gr
3
- import imutils
4
- import numpy as np
5
  import torch
 
 
 
 
6
  from pytorchvideo.transforms import (
7
- ApplyTransformToKey,
8
  Normalize,
9
- RandomShortSideScale,
10
- RemoveKey,
11
- ShortSideScale,
12
- UniformTemporalSubsample,
13
  )
14
  from torchvision.transforms import (
15
  Compose,
16
  Lambda,
17
- RandomCrop,
18
- RandomHorizontalFlip,
19
- Resize,
20
  )
21
- from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification
22
 
23
- MODEL_CKPT = "sayakpaul/videomae-base-finetuned-kinetics-finetuned-ucf101-subset"
24
- DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
25
 
26
- MODEL = VideoMAEForVideoClassification.from_pretrained(MODEL_CKPT).to(DEVICE)
27
- PROCESSOR = VideoMAEFeatureExtractor.from_pretrained(MODEL_CKPT)
28
-
29
- RESIZE_TO = PROCESSOR.size["shortest_edge"]
30
- NUM_FRAMES_TO_SAMPLE = MODEL.config.num_frames
31
- IMAGE_STATS = {"image_mean": [0.485, 0.456, 0.406], "image_std": [0.229, 0.224, 0.225]}
32
- VAL_TRANSFORMS = Compose(
33
- [
34
- UniformTemporalSubsample(NUM_FRAMES_TO_SAMPLE),
35
- Lambda(lambda x: x / 255.0),
36
- Normalize(IMAGE_STATS["image_mean"], IMAGE_STATS["image_std"]),
37
- Resize((RESIZE_TO, RESIZE_TO)),
38
- ]
39
- )
40
- LABELS = list(MODEL.config.label2id.keys())
41
-
42
-
43
- def parse_video(video_file):
44
- """A utility to parse the input videos.
45
-
46
- Reference: https://pyimagesearch.com/2018/11/12/yolo-object-detection-with-opencv/
47
- """
48
- vs = cv2.VideoCapture(video_file)
49
-
50
- # try to determine the total number of frames in the video file
51
- try:
52
- prop = (
53
- cv2.cv.CV_CAP_PROP_FRAME_COUNT
54
- if imutils.is_cv2()
55
- else cv2.CAP_PROP_FRAME_COUNT
56
- )
57
- total = int(vs.get(prop))
58
- print("[INFO] {} total frames in video".format(total))
59
-
60
- # an error occurred while trying to determine the total
61
- # number of frames in the video file
62
- except:
63
- print("[INFO] could not determine # of frames in video")
64
- print("[INFO] no approx. completion time can be provided")
65
- total = -1
66
-
67
- frames = []
68
-
69
- # loop over frames from the video file stream
70
- while True:
71
- # read the next frame from the file
72
- (grabbed, frame) = vs.read()
73
- if frame is not None:
74
- frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
75
- frames.append(frame)
76
- # if the frame was not grabbed, then we have reached the end
77
- # of the stream
78
- if not grabbed:
79
- break
80
-
81
- return frames
82
-
83
-
84
- def preprocess_video(frames: list):
85
- """Utility to apply preprocessing transformations to a video tensor."""
86
- # Each frame in the `frames` list has the shape: (height, width, num_channels).
87
- # Collated together the `frames` has the the shape: (num_frames, height, width, num_channels).
88
- # So, after converting the `frames` list to a torch tensor, we permute the shape
89
- # such that it becomes (num_channels, num_frames, height, width) to make
90
- # the shape compatible with the preprocessing transformations. After applying the
91
- # preprocessing chain, we permute the shape to (num_frames, num_channels, height, width)
92
- # to make it compatible with the model. Finally, we add a batch dimension so that our video
93
- # classification model can operate on it.
94
- video_tensor = torch.tensor(np.array(frames).astype(frames[0].dtype))
95
- video_tensor = video_tensor.permute(
96
- 3, 0, 1, 2
97
- ) # (num_channels, num_frames, height, width)
98
- video_tensor_pp = VAL_TRANSFORMS(video_tensor)
99
- video_tensor_pp = video_tensor_pp.permute(
100
- 1, 0, 2, 3
101
- ) # (num_frames, num_channels, height, width)
102
- video_tensor_pp = video_tensor_pp.unsqueeze(0)
103
- return video_tensor_pp.to(DEVICE)
104
 
105
 
106
  def infer(video_file):
107
- frames = parse_video(video_file)
108
- video_tensor = preprocess_video(frames)
109
- inputs = {"pixel_values": video_tensor}
110
 
111
  # forward pass
112
  with torch.no_grad():
113
- outputs = MODEL(**inputs)
114
- logits = outputs.logits
 
115
  softmax_scores = torch.nn.functional.softmax(logits, dim=-1).squeeze(0)
116
- confidences = {LABELS[i]: float(softmax_scores[i]) for i in range(len(LABELS))}
117
  return confidences
118
 
119
 
 
 
1
  import gradio as gr
 
 
2
  import torch
3
+ from pytorchvideo.data import make_clip_sampler
4
+ from pytorchvideo.data.clip_sampling import ClipInfoList
5
+ from pytorchvideo.data.encoded_video_pyav import EncodedVideoPyAV
6
+ from pytorchvideo.data.video import VideoPathHandler
7
  from pytorchvideo.transforms import (
 
8
  Normalize,
9
+ UniformTemporalSubsample, RandomShortSideScale,
 
 
 
10
  )
11
  from torchvision.transforms import (
12
  Compose,
13
  Lambda,
14
+ Resize, RandomCrop,
 
 
15
  )
16
+ from transformers import pipeline
17
 
 
 
18
 
19
# Hub checkpoint of the fine-tuned VideoMAE classifier served by this app.
MODEL_CKPT = "omermazig/videomae-finetuned-nba-5-class-8-batch-8000-vid-multiclass_1697155188"
# NOTE(review): DEVICE is computed here but not passed to the pipeline or used
# by the code visible in this file - confirm whether GPU inference is intended.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of clips sampled from every uploaded video.
CLIPS_FROM_SINGLE_VIDEO = 5

# The pipeline is built only to get hold of the model and its image processor.
pipe = pipeline("video-classification", model=MODEL_CKPT)
trained_model = pipe.model
image_processor = pipe.image_processor

# Normalisation statistics come from the checkpoint's image processor.
mean = image_processor.image_mean
std = image_processor.image_std

# The processor's size holds either a single "shortest_edge" value or an
# explicit "height"/"width" pair.
if "shortest_edge" not in image_processor.size:
    height = image_processor.size["height"]
    width = image_processor.size["width"]
else:
    height = width = image_processor.size["shortest_edge"]
resize_to = (height, width)

# Temporal sampling: one clip spans num_frames * sample_rate frames at `fps`.
num_frames_to_sample = trained_model.config.num_frames
sample_rate = 4
fps = 30
clip_duration = num_frames_to_sample * sample_rate / fps

# Transformations applied to every clip before it is fed to the model.
# NOTE(review): RandomShortSideScale and RandomCrop are random operators, so
# the same clip can be transformed differently between calls - confirm this is
# intended for inference.
inference_transform = Compose(
    [
        UniformTemporalSubsample(num_frames_to_sample),
        Lambda(lambda x: x / 255.0),
        Normalize(mean, std),
        RandomShortSideScale(min_size=256, max_size=320),
        RandomCrop(resize_to),
    ]
)

# Class names, in the iteration order of the model config's label2id mapping.
labels = list(trained_model.config.label2id)
53
+
54
+
55
def parse_video_to_clips(video_file):
    """Decode a video file and return a batch of transformed clips.

    ``CLIPS_FROM_SINGLE_VIDEO`` clips of ``clip_duration`` seconds are drawn
    with the "random_multi" clip sampler, each one is passed through
    ``inference_transform``, and the results are stacked into one tensor.

    Args:
        video_file: Path of the video file to decode.

    Returns:
        A tensor with one entry per sampled clip along the leading dimension.
        Each clip has its first two dimensions swapped after the transform
        (presumably (C, T, H, W) -> (T, C, H, W) - TODO confirm).

    Raises:
        ValueError: If a sampled time range yields no decodable video frames.
    """
    video: EncodedVideoPyAV = VideoPathHandler().video_from_path(video_file)
    try:
        clip_sampler = make_clip_sampler("random_multi", clip_duration, CLIPS_FROM_SINGLE_VIDEO)
        # noinspection PyTypeChecker
        clip_info: ClipInfoList = clip_sampler(0, video.duration, {})

        clips = []
        for clip_start, clip_end in zip(clip_info.clip_start_sec, clip_info.clip_end_sec):
            frames = video.get_clip(clip_start, clip_end)["video"]
            if frames is None:
                # Fail with a clear message instead of an opaque error from
                # inside the transform chain.
                raise ValueError(
                    f"No video frames decoded between {clip_start} and {clip_end} seconds"
                )
            clips.append(inference_transform(frames).permute(1, 0, 2, 3))
    finally:
        # Release the underlying container even when sampling/decoding fails,
        # so repeated requests do not leak open video handles.
        video.close()

    return torch.stack(clips)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
 
73
  def infer(video_file):
74
+ videos_tensor = parse_video_to_clips(video_file)
75
+ inputs = {"pixel_values": videos_tensor}
 
76
 
77
  # forward pass
78
  with torch.no_grad():
79
+ outputs = trained_model(**inputs)
80
+ multiple_logits = outputs.logits
81
+ logits = multiple_logits.sum(dim=0)
82
  softmax_scores = torch.nn.functional.softmax(logits, dim=-1).squeeze(0)
83
+ confidences = {labels[i]: float(softmax_scores[i]) for i in range(len(labels))}
84
  return confidences
85
 
86