rt-detr-object-detection-webrtc

Running on Zero

App Files Community

freddyaboulton HF staff committed on Sep 13

Commit

4467a7b

•

1 Parent(s): 619c27a

Fix

Browse files

Files changed (2) hide show

app.py +12 -11
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -5,23 +5,25 @@ from PIL import Image
 import torch
 import time
 import numpy as np
 from transformers import RTDetrForObjectDetection, RTDetrImageProcessor
 from draw_boxes import draw_bounding_boxes
 image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd")
-model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd")
-SUBSAMPLE = 10
 @spaces.GPU
 def stream_object_detection(video, conf_threshold):
     cap = cv2.VideoCapture(video)
-    video_codec = cv2.VideoWriter_fourcc(*"x264") # type: ignore
     fps = int(cap.get(cv2.CAP_PROP_FPS))
     desired_fps = fps // SUBSAMPLE
     width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) // 2
     height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) // 2
@@ -29,9 +31,8 @@ def stream_object_detection(video, conf_threshold):
     iterating, frame = cap.read()
     n_frames = 0
-    n_chunks = 0
-    name = f"output_{n_chunks}.ts"
     segment_file = cv2.VideoWriter(name, video_codec, desired_fps, (width, height)) # type: ignore
     batch = []
@@ -41,15 +42,16 @@ def stream_object_detection(video, conf_threshold):
         if n_frames % SUBSAMPLE == 0:
             batch.append(frame)
         if len(batch) == 2 * desired_fps:
-            inputs = image_processor(images=batch, return_tensors="pt")
             print(f"starting batch of size {len(batch)}")
             start = time.time()
             with torch.no_grad():
                 outputs = model(**inputs)
             end = time.time()
-            print("time taken ", end - start)
             boxes = image_processor.post_process_object_detection(
                 outputs,
                 target_sizes=torch.tensor([(height, width)] * len(batch)),
@@ -57,7 +59,6 @@ def stream_object_detection(video, conf_threshold):
             for i, (array, box) in enumerate(zip(batch, boxes)):
                 pil_image = draw_bounding_boxes(Image.fromarray(array), box, model, conf_threshold)
-                pil_image.save(f"batch_{n_chunks}_detection_{i}.png")
                 frame = np.array(pil_image)
                 # Convert RGB to BGR
                 frame = frame[:, :, ::-1].copy()
@@ -66,9 +67,9 @@ def stream_object_detection(video, conf_threshold):
             batch = []
             segment_file.release()
             yield name
-            n_frames = 0
-            n_chunks += 1
-            name = f"output_{n_chunks}.ts"
             segment_file = cv2.VideoWriter(name, video_codec, desired_fps, (width, height)) # type: ignore
         iterating, frame = cap.read()

 import torch
 import time
 import numpy as np
+import uuid
 from transformers import RTDetrForObjectDetection, RTDetrImageProcessor
 from draw_boxes import draw_bounding_boxes
 image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd")
+model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd").to("cuda")
+SUBSAMPLE = 2
 @spaces.GPU
 def stream_object_detection(video, conf_threshold):
     cap = cv2.VideoCapture(video)
+    video_codec = cv2.VideoWriter_fourcc(*"mp4v") # type: ignore
     fps = int(cap.get(cv2.CAP_PROP_FPS))
     desired_fps = fps // SUBSAMPLE
     width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) // 2
     height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) // 2
     iterating, frame = cap.read()
     n_frames = 0
+    name = f"output_{uuid.uuid4()}.mp4"
     segment_file = cv2.VideoWriter(name, video_codec, desired_fps, (width, height)) # type: ignore
     batch = []
         if n_frames % SUBSAMPLE == 0:
             batch.append(frame)
         if len(batch) == 2 * desired_fps:
+            inputs = image_processor(images=batch, return_tensors="pt").to("cuda")
             print(f"starting batch of size {len(batch)}")
             start = time.time()
             with torch.no_grad():
                 outputs = model(**inputs)
             end = time.time()
+            print("time taken for inference", end - start)
+            start = time.time()
             boxes = image_processor.post_process_object_detection(
                 outputs,
                 target_sizes=torch.tensor([(height, width)] * len(batch)),
             for i, (array, box) in enumerate(zip(batch, boxes)):
                 pil_image = draw_bounding_boxes(Image.fromarray(array), box, model, conf_threshold)
                 frame = np.array(pil_image)
                 # Convert RGB to BGR
                 frame = frame[:, :, ::-1].copy()
             batch = []
             segment_file.release()
             yield name
+            end = time.time()
+            print("time taken for processing boxes", end - start)
+            name = f"output_{uuid.uuid4()}.mp4"
             segment_file = cv2.VideoWriter(name, video_codec, desired_fps, (width, height)) # type: ignore
         iterating, frame = cap.read()

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 safetensors==0.4.3
 opencv-python
 torch

+--extra-index-url https://download.pytorch.org/whl/cu113
 safetensors==0.4.3
 opencv-python
 torch