Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,837 Bytes
780389c 8b2cbe6 1e8e71b 6a95f1f 61732db 8b2cbe6 6a95f1f ccc35d4 619c27a 1e8e71b 6a95f1f 619c27a 6a95f1f 619c27a 6a95f1f 61732db 619c27a 6a95f1f b7278d2 6a95f1f 619c27a 6a95f1f 61732db 6a95f1f 61732db 6a95f1f 61732db 6a95f1f 619c27a 6a95f1f 619c27a 61732db 619c27a 61732db 6a95f1f 619c27a 6a95f1f 619c27a 6a95f1f 619c27a 6a95f1f 619c27a 9740995 619c27a ccc35d4 8b2cbe6 6a95f1f 8b2cbe6 9740995 8b2cbe6 6a95f1f 8b2cbe6 9740995 619c27a 9740995 ccc35d4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
import spaces
import gradio as gr
import cv2
from PIL import Image
import torch
import time
import numpy as np
from transformers import RTDetrForObjectDetection, RTDetrImageProcessor
from draw_boxes import draw_bounding_boxes
# RT-DETR preprocessing/postprocessing pipeline and detection model, loaded
# once at import time from the Hugging Face Hub (module-level side effect:
# downloads weights on first run).
image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd")
model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd")
# Run detection on only every SUBSAMPLE-th frame of the input video.
SUBSAMPLE = 10
def _detect_and_write(batch, conf_threshold, segment_file):
    """Run RT-DETR on a batch of RGB frames and write annotated frames.

    Args:
        batch: List of HxWx3 RGB uint8 frames (all the same size).
        conf_threshold: Minimum detection confidence in [0, 1].
        segment_file: Open ``cv2.VideoWriter`` to append annotated frames to.
    """
    inputs = image_processor(images=batch, return_tensors="pt")
    print(f"starting batch of size {len(batch)}")
    start = time.time()
    with torch.no_grad():
        outputs = model(**inputs)
    print("time taken ", time.time() - start)

    # Scale boxes back to the actual frame size.
    height, width = batch[0].shape[:2]
    boxes = image_processor.post_process_object_detection(
        outputs,
        target_sizes=torch.tensor([(height, width)] * len(batch)),
        threshold=conf_threshold,
    )
    for array, box in zip(batch, boxes):
        pil_image = draw_bounding_boxes(Image.fromarray(array), box, model, conf_threshold)
        frame = np.array(pil_image)
        # PIL produces RGB; OpenCV's VideoWriter expects BGR.
        segment_file.write(frame[:, :, ::-1].copy())


@spaces.GPU
def stream_object_detection(video, conf_threshold):
    """Stream object detection results over a video file.

    Reads ``video`` with OpenCV, keeps every ``SUBSAMPLE``-th frame at half
    resolution, runs RT-DETR on batches of roughly two seconds of subsampled
    video, and yields the filename of each finished ``.ts`` segment so the
    UI can start playback before the whole video is processed.

    Args:
        video: Path to the input video file.
        conf_threshold: Minimum detection confidence in [0, 1].

    Yields:
        str: Path of the next completed video segment.
    """
    cap = cv2.VideoCapture(video)
    video_codec = cv2.VideoWriter_fourcc(*"x264")  # type: ignore
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    # Guard against fps < SUBSAMPLE, which would yield a broken 0-fps writer.
    desired_fps = max(fps // SUBSAMPLE, 1)
    # Frames are downscaled by 2 below, so the writer gets half dimensions.
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) // 2
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) // 2

    iterating, frame = cap.read()
    n_frames = 0
    n_chunks = 0
    name = f"output_{n_chunks}.ts"
    segment_file = cv2.VideoWriter(name, video_codec, desired_fps, (width, height))  # type: ignore
    batch = []

    try:
        while iterating:
            frame = cv2.resize(frame, (0, 0), fx=0.5, fy=0.5)
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            if n_frames % SUBSAMPLE == 0:
                batch.append(frame)
            # ~2 seconds of subsampled video per inference batch / segment.
            if len(batch) == 2 * desired_fps:
                _detect_and_write(batch, conf_threshold, segment_file)
                batch = []
                # Release before yielding so the segment is fully flushed
                # to disk when the player fetches it.
                segment_file.release()
                yield name
                n_frames = 0
                n_chunks += 1
                name = f"output_{n_chunks}.ts"
                segment_file = cv2.VideoWriter(name, video_codec, desired_fps, (width, height))  # type: ignore

            iterating, frame = cap.read()
            n_frames += 1

        # Flush trailing frames that did not fill a whole batch; the
        # previous implementation silently dropped them.
        if batch:
            _detect_and_write(batch, conf_threshold, segment_file)
            segment_file.release()
            yield name
    finally:
        # Always free the capture and the (possibly already released,
        # release() is idempotent) writer, even if the consumer abandons
        # the generator early.
        cap.release()
        segment_file.release()
# Custom CSS hook; the ``my-group`` class used below can be styled here.
css = ""

with gr.Blocks(css=css) as app:
    # Page title.
    gr.HTML(
        """
    <h1 style='text-align: center'>
    Video Object Detection with RT-DETR
    </h1>
    """)
    # Paper link plus model link. The second href targets the model card on
    # the Hugging Face Hub, so it is labeled "model" (was mislabeled "github").
    gr.HTML(
        """
    <h3 style='text-align: center'>
    <a href='https://arxiv.org/abs/2304.08069' target='_blank'>arXiv</a> | <a href='https://huggingface.co/PekingU/rtdetr_r101vd_coco_o365' target='_blank'>model</a>
    </h3>
    """)
    with gr.Row():
        with gr.Column():
            with gr.Group(elem_classes=["my-group"]):
                video = gr.Video(label="Video Source")
                conf_threshold = gr.Slider(
                    label="Confidence Threshold",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.30,
                )
        with gr.Column():
            # streaming=True lets playback begin as soon as the first .ts
            # segment is yielded by stream_object_detection.
            output_video = gr.Video(label="Processed Video", streaming=True, autoplay=True)

    # Run detection whenever a video is uploaded; each yielded segment
    # streams into output_video.
    video.upload(
        fn=stream_object_detection,
        inputs=[video, conf_threshold],
        outputs=[output_video],
    )

if __name__ == '__main__':
    app.launch()
|