File size: 3,837 Bytes
780389c
8b2cbe6
1e8e71b
6a95f1f
61732db
 
 
8b2cbe6
6a95f1f
 
 
 
 
 
ccc35d4
619c27a
 
 
1e8e71b
6a95f1f
 
 
619c27a
6a95f1f
619c27a
 
 
6a95f1f
 
 
 
 
61732db
619c27a
 
6a95f1f
 
 
b7278d2
6a95f1f
619c27a
6a95f1f
61732db
6a95f1f
 
61732db
 
6a95f1f
 
61732db
 
6a95f1f
 
 
619c27a
6a95f1f
 
619c27a
61732db
619c27a
61732db
6a95f1f
 
 
 
619c27a
6a95f1f
619c27a
6a95f1f
 
619c27a
 
6a95f1f
 
 
 
619c27a
 
9740995
619c27a
ccc35d4
8b2cbe6
 
 
6a95f1f
8b2cbe6
9740995
8b2cbe6
 
 
6a95f1f
8b2cbe6
9740995
619c27a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9740995
 
ccc35d4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import spaces
import gradio as gr
import cv2
from PIL import Image
import torch
import time
import numpy as np

from transformers import RTDetrForObjectDetection, RTDetrImageProcessor

from draw_boxes import draw_bounding_boxes

# Hub checkpoint shared by the image pre-processor and the detection model.
_CHECKPOINT = "PekingU/rtdetr_r50vd"

# Only one out of every SUBSAMPLE input frames is sent through the detector.
SUBSAMPLE = 10

image_processor = RTDetrImageProcessor.from_pretrained(_CHECKPOINT)
model = RTDetrForObjectDetection.from_pretrained(_CHECKPOINT)

def _write_detection_segment(batch, name, video_codec, fps, frame_size,
                             conf_threshold, chunk_index):
    """Run detection on ``batch`` and encode the annotated frames to ``name``.

    Args:
        batch: List of RGB frames (numpy arrays), all of size ``frame_size``.
        name: Path of the video segment to create.
        video_codec: FourCC code handed to ``cv2.VideoWriter``.
        fps: Playback rate of the written segment.
        frame_size: ``(width, height)`` of every frame in ``batch``.
        conf_threshold: Minimum detection score to keep.
        chunk_index: Index of this segment; used in the per-frame PNG names.
    """
    width, height = frame_size
    inputs = image_processor(images=batch, return_tensors="pt")

    print(f"starting batch of size {len(batch)}")
    start = time.time()
    with torch.no_grad():
        outputs = model(**inputs)
    end = time.time()
    print("time taken ", end - start)

    boxes = image_processor.post_process_object_detection(
        outputs,
        target_sizes=torch.tensor([(height, width)] * len(batch)),
        threshold=conf_threshold,
    )

    segment_file = cv2.VideoWriter(name, video_codec, fps, (width, height))  # type: ignore
    try:
        for i, (array, box) in enumerate(zip(batch, boxes)):
            pil_image = draw_bounding_boxes(Image.fromarray(array), box, model, conf_threshold)
            # NOTE(review): every annotated frame is also dumped as a PNG in
            # the working directory; kept for backward compatibility.
            pil_image.save(f"batch_{chunk_index}_detection_{i}.png")
            # Convert RGB to BGR, the channel order OpenCV writes.
            segment_file.write(np.array(pil_image)[:, :, ::-1].copy())
    finally:
        # Always finalize the file, even if drawing/encoding raised.
        segment_file.release()


@spaces.GPU
def stream_object_detection(video, conf_threshold):
    """Yield annotated video segments for ``video`` as they become available.

    The input is read with OpenCV; every ``SUBSAMPLE``-th frame is scaled to
    half resolution and collected. Each time ``2 * desired_fps`` frames have
    been collected (two seconds of output at ``desired_fps``), detection is
    run on them and the annotated frames are written to ``output_<n>.ts``.
    Frames left over when the input ends are written as a final, shorter
    segment instead of being dropped.

    Args:
        video: Source passed to ``cv2.VideoCapture``.
        conf_threshold: Minimum detection score for a box to be drawn.

    Yields:
        str: Path of each finished segment, in playback order.
    """
    cap = cv2.VideoCapture(video)
    try:
        video_codec = cv2.VideoWriter_fourcc(*"x264")  # type: ignore
        fps = int(cap.get(cv2.CAP_PROP_FPS))
        # Sources slower than SUBSAMPLE fps would otherwise give 0 here, which
        # makes the batch-size check below unreachable and the writer invalid.
        desired_fps = max(1, fps // SUBSAMPLE)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) // 2
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) // 2
        batch_size = 2 * desired_fps

        n_frames = 0
        n_chunks = 0
        batch = []

        iterating, frame = cap.read()
        while iterating:
            if n_frames % SUBSAMPLE == 0:
                # Resize to the exact writer size: scaling by fx/fy rounds the
                # output dimensions and can disagree with ``// 2`` for odd
                # source sizes. Skipped frames are not processed at all.
                frame = cv2.resize(frame, (width, height))
                batch.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            if len(batch) == batch_size:
                name = f"output_{n_chunks}.ts"
                _write_detection_segment(
                    batch, name, video_codec, desired_fps,
                    (width, height), conf_threshold, n_chunks,
                )
                batch = []
                yield name
                n_chunks += 1

            iterating, frame = cap.read()
            n_frames += 1

        if batch:
            # Flush the tail of the video that did not fill a whole batch.
            name = f"output_{n_chunks}.ts"
            _write_detection_segment(
                batch, name, video_codec, desired_fps,
                (width, height), conf_threshold, n_chunks,
            )
            yield name
    finally:
        # Runs on normal exhaustion, on errors, and when the consumer closes
        # the generator early.
        cap.release()

# Custom layout CSS is currently disabled (an empty stylesheet is passed to
# gr.Blocks below). The rules for the "my-group" / "my-column" classes are
# kept here for reference:
# css=""".my-group {max-width: 600px !important; max-height: 600 !important;}
#                       .my-column {display: flex !important; justify-content: center !important; align-items: center !important};"""

css=""
with gr.Blocks(css=css) as app:
    # Page title.
    gr.HTML(
        """
    <h1 style='text-align: center'>
    Video Object Detection with RT-DETR
    </h1>
    """)
    # Reference links. The second link targets a Hugging Face model page, so
    # it is labelled "model" rather than "github".
    gr.HTML(
        """
        <h3 style='text-align: center'>
        <a href='https://arxiv.org/abs/2304.08069' target='_blank'>arXiv</a> | <a href='https://huggingface.co/PekingU/rtdetr_r101vd_coco_o365' target='_blank'>model</a>
        </h3>
        """)
    with gr.Row():
        # Left column: input video and detection threshold.
        with gr.Column():
            with gr.Group(elem_classes=["my-group"]):
                video = gr.Video(label="Video Source")
                conf_threshold = gr.Slider(
                    label="Confidence Threshold",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.30,
                )
        # Right column: streamed, auto-playing output.
        with gr.Column():
            output_video = gr.Video(label="Processed Video", streaming=True, autoplay=True)

    # Detection starts when a video is uploaded; each path yielded by the
    # generator is sent to the streaming output component.
    video.upload(
        fn=stream_object_detection,
        inputs=[video, conf_threshold],
        outputs=[output_video],
    )

if __name__ == '__main__':
    app.launch()