freddyaboulton HF staff committed on
Commit
0350aec
1 Parent(s): 67e08d4

Upload folder using huggingface_hub

Files changed (5)
  1. README.md +9 -41
  2. draw_boxes.py +17 -13
  3. requirements.txt +2 -4
  4. run.ipynb +1 -0
  5. run.py +115 -0
README.md CHANGED
@@ -1,44 +1,12 @@
+
  ---
- license: mit
- tags:
- - object-detection
- - computer-vision
- - yolov10
- datasets:
- - detection-datasets/coco
+ title: rt-detr-object-detection
+ emoji: 🔥
+ colorFrom: indigo
+ colorTo: indigo
  sdk: gradio
- sdk_version: 4.42.0
+ sdk_version: 5.0.0
+ app_file: run.py
+ pinned: false
+ hf_oauth: true
  ---
-
- ### Model Description
- [YOLOv10: Real-Time End-to-End Object Detection](https://arxiv.org/abs/2405.14458v1)
-
- - arXiv: https://arxiv.org/abs/2405.14458v1
- - github: https://github.com/THU-MIG/yolov10
-
- ### Installation
- ```
- pip install supervision git+https://github.com/THU-MIG/yolov10.git
- ```
-
- ### Yolov10 Inference
- ```python
- from ultralytics import YOLOv10
- import supervision as sv
- import cv2
-
- IMAGE_PATH = 'dog.jpeg'
-
- model = YOLOv10.from_pretrained('jameslahm/yolov10{n/s/m/b/l/x}')
- model.predict(IMAGE_PATH, show=True)
- ```
-
- ### BibTeX Entry and Citation Info
- ```
- @article{wang2024yolov10,
-   title={YOLOv10: Real-Time End-to-End Object Detection},
-   author={Wang, Ao and Chen, Hui and Liu, Lihao and Chen, Kai and Lin, Zijia and Han, Jungong and Ding, Guiguang},
-   journal={arXiv preprint arXiv:2405.14458},
-   year={2024}
- }
- ```
draw_boxes.py CHANGED
@@ -1,7 +1,7 @@
- from PIL import Image, ImageDraw, ImageFont
- import numpy as np
+ from PIL import ImageDraw, ImageFont  # type: ignore
  import colorsys

+
  def get_color(label):
      # Simple hash function to generate consistent colors for each label
      hash_value = hash(label)
@@ -11,31 +11,35 @@ def get_color(label):
      rgb = colorsys.hsv_to_rgb(hue, saturation, value)
      return tuple(int(x * 255) for x in rgb)

- def draw_bounding_boxes(image: Image, results: dict, model, threshold=0.3):
+
+ def draw_bounding_boxes(image, results: dict, model, threshold=0.3):
      draw = ImageDraw.Draw(image)
      font = ImageFont.load_default()

-     for score, label_id, box in zip(results["scores"], results["labels"], results["boxes"]):
+     for score, label_id, box in zip(
+         results["scores"], results["labels"], results["boxes"]
+     ):
          if score > threshold:
              label = model.config.id2label[label_id.item()]
              box = [round(i, 2) for i in box.tolist()]
              color = get_color(label)
-
+
              # Draw bounding box
-             draw.rectangle(box, outline=color, width=3)
-
+             draw.rectangle(box, outline=color, width=3)  # type: ignore
+
              # Prepare text
              text = f"{label}: {score:.2f}"
              text_bbox = draw.textbbox((0, 0), text, font=font)
              text_width = text_bbox[2] - text_bbox[0]
              text_height = text_bbox[3] - text_bbox[1]
-
+
              # Draw text background
-             draw.rectangle([box[0], box[1] - text_height - 4, box[0] + text_width, box[1]], fill=color)
-
+             draw.rectangle(
+                 [box[0], box[1] - text_height - 4, box[0] + text_width, box[1]],  # type: ignore
+                 fill=color,  # type: ignore
+             )
+
              # Draw text
              draw.text((box[0], box[1] - text_height - 4), text, fill="white", font=font)
-
+
      return image
-
- import numpy as np
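For orientation, here is a minimal sketch of how the reworked `draw_bounding_boxes` might be exercised on a single image. The image path and the single-image pipeline are assumptions for illustration, not part of this commit (run.py feeds it subsampled video frames instead):

```python
# Hypothetical single-image smoke test for draw_bounding_boxes (not part of this commit).
import torch
from PIL import Image
from transformers import RTDetrForObjectDetection, RTDetrImageProcessor

from draw_boxes import draw_bounding_boxes

image = Image.open("example.jpg")  # assumed local test image
image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd")
model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd")

inputs = image_processor(images=[image], return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# post_process_object_detection returns one {"scores", "labels", "boxes"} dict
# per image -- exactly the shape draw_bounding_boxes consumes.
results = image_processor.post_process_object_detection(
    outputs,
    target_sizes=torch.tensor([image.size[::-1]]),  # (height, width)
    threshold=0.3,
)[0]

annotated = draw_bounding_boxes(image, results, model, threshold=0.3)
annotated.save("annotated.jpg")
```

Dropping the old `image: Image` annotation also removes a latent typing bug: after `from PIL import Image`, the name `Image` is the PIL module, not a class, so the annotation never named a real type.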
requirements.txt CHANGED
@@ -1,7 +1,5 @@
- --extra-index-url https://download.pytorch.org/whl/cu113
  safetensors==0.4.3
  opencv-python
  torch
- transformers
- gradio-client @ git+https://github.com/gradio-app/gradio@66349fe26827e3a3c15b738a1177e95fec7f5554#subdirectory=client/python
- https://gradio-pypi-previews.s3.amazonaws.com/66349fe26827e3a3c15b738a1177e95fec7f5554/gradio-4.42.0-py3-none-any.whl
+ transformers>=4.43.0
+ Pillow
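A likely reason for the `transformers>=4.43.0` floor, inferred from the code rather than stated in the commit: the `RTDetr` classes imported by run.py first shipped around that release. A quick guard such as the following (with `packaging` as an assumed helper dependency) fails fast on older installs:

```python
# Assumed version guard: the RTDetr classes require a recent transformers release.
import transformers
from packaging.version import Version  # packaging is an assumed extra helper

if Version(transformers.__version__) < Version("4.43.0"):
    raise RuntimeError("This demo expects transformers >= 4.43.0 for RT-DETR support")
```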
run.ipynb ADDED
@@ -0,0 +1 @@
+ {"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: rt-detr-object-detection"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio safetensors==0.4.3 opencv-python torch transformers>=4.43.0 Pillow "]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "!wget -q https://github.com/gradio-app/gradio/raw/main/demo/rt-detr-object-detection/draw_boxes.py"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["import spaces\n", "import gradio as gr\n", "import cv2\n", "from PIL import Image\n", "import torch\n", "import time\n", "import numpy as np\n", "import uuid\n", "\n", "from transformers import RTDetrForObjectDetection, RTDetrImageProcessor  # type: ignore\n", "\n", "from draw_boxes import draw_bounding_boxes\n", "\n", "image_processor = RTDetrImageProcessor.from_pretrained(\"PekingU/rtdetr_r50vd\")\n", "model = RTDetrForObjectDetection.from_pretrained(\"PekingU/rtdetr_r50vd\").to(\"cuda\")\n", "\n", "\n", "SUBSAMPLE = 2\n", "\n", "\n", "@spaces.GPU\n", "def stream_object_detection(video, conf_threshold):\n", "    cap = cv2.VideoCapture(video)\n", "\n", "    video_codec = cv2.VideoWriter_fourcc(*\"mp4v\")  # type: ignore\n", "    fps = int(cap.get(cv2.CAP_PROP_FPS))\n", "\n", "    desired_fps = fps // SUBSAMPLE\n", "    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) // 2\n", "    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) // 2\n", "\n", "    iterating, frame = cap.read()\n", "\n", "    n_frames = 0\n", "\n", "    name = f\"output_{uuid.uuid4()}.mp4\"\n", "    segment_file = cv2.VideoWriter(name, video_codec, desired_fps, (width, height))  # type: ignore\n", "    batch = []\n", "\n", "    while iterating:\n", "        frame = cv2.resize(frame, (0, 0), fx=0.5, fy=0.5)\n", "        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n", "        if n_frames % SUBSAMPLE == 0:\n", "            batch.append(frame)\n", "        if len(batch) == 2 * desired_fps:\n", "            inputs = image_processor(images=batch, return_tensors=\"pt\").to(\"cuda\")\n", "\n", "            print(f\"starting batch of size {len(batch)}\")\n", "            start = time.time()\n", "            with torch.no_grad():\n", "                outputs = model(**inputs)\n", "            end = time.time()\n", "            print(\"time taken for inference\", end - start)\n", "\n", "            start = time.time()\n", "            boxes = image_processor.post_process_object_detection(\n", "                outputs,\n", "                target_sizes=torch.tensor([(height, width)] * len(batch)),\n", "                threshold=conf_threshold,\n", "            )\n", "\n", "            for _, (array, box) in enumerate(zip(batch, boxes)):\n", "                pil_image = draw_bounding_boxes(\n", "                    Image.fromarray(array), box, model, conf_threshold\n", "                )\n", "                frame = np.array(pil_image)\n", "                # Convert RGB to BGR\n", "                frame = frame[:, :, ::-1].copy()\n", "                segment_file.write(frame)\n", "\n", "            batch = []\n", "            segment_file.release()\n", "            yield name\n", "            end = time.time()\n", "            print(\"time taken for processing boxes\", end - start)\n", "            name = f\"output_{uuid.uuid4()}.mp4\"\n", "            segment_file = cv2.VideoWriter(\n", "                name, video_codec, desired_fps, (width, height)\n", "            )  # type: ignore\n", "\n", "        iterating, frame = cap.read()\n", "        n_frames += 1\n", "\n", "\n", "with gr.Blocks() as demo:\n", "    gr.HTML(\n", "        \"\"\"\n", "    <h1 style='text-align: center'>\n", "    Video Object Detection with <a href='https://huggingface.co/PekingU/rtdetr_r101vd_coco_o365' target='_blank'>RT-DETR</a>\n", "    </h1>\n", "    \"\"\"\n", "    )\n", "    with gr.Row():\n", "        with gr.Column():\n", "            video = gr.Video(label=\"Video Source\")\n", "            conf_threshold = gr.Slider(\n", "                label=\"Confidence Threshold\",\n", "                minimum=0.0,\n", "                maximum=1.0,\n", "                step=0.05,\n", "                value=0.30,\n", "            )\n", "        with gr.Column():\n", "            output_video = gr.Video(\n", "                label=\"Processed Video\", streaming=True, autoplay=True\n", "            )\n", "\n", "    video.upload(\n", "        fn=stream_object_detection,\n", "        inputs=[video, conf_threshold],\n", "        outputs=[output_video],\n", "    )\n", "\n", "if __name__ == \"__main__\":\n", "    demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
run.py ADDED
@@ -0,0 +1,115 @@
+ import spaces
+ import gradio as gr
+ import cv2
+ from PIL import Image
+ import torch
+ import time
+ import numpy as np
+ import uuid
+
+ from transformers import RTDetrForObjectDetection, RTDetrImageProcessor  # type: ignore
+
+ from draw_boxes import draw_bounding_boxes
+
+ image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd")
+ model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd").to("cuda")
+
+
+ SUBSAMPLE = 2
+
+
+ @spaces.GPU
+ def stream_object_detection(video, conf_threshold):
+     cap = cv2.VideoCapture(video)
+
+     video_codec = cv2.VideoWriter_fourcc(*"mp4v")  # type: ignore
+     fps = int(cap.get(cv2.CAP_PROP_FPS))
+
+     desired_fps = fps // SUBSAMPLE
+     width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) // 2
+     height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) // 2
+
+     iterating, frame = cap.read()
+
+     n_frames = 0
+
+     name = f"output_{uuid.uuid4()}.mp4"
+     segment_file = cv2.VideoWriter(name, video_codec, desired_fps, (width, height))  # type: ignore
+     batch = []
+
+     while iterating:
+         frame = cv2.resize(frame, (0, 0), fx=0.5, fy=0.5)
+         frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+         if n_frames % SUBSAMPLE == 0:
+             batch.append(frame)
+         if len(batch) == 2 * desired_fps:
+             inputs = image_processor(images=batch, return_tensors="pt").to("cuda")
+
+             print(f"starting batch of size {len(batch)}")
+             start = time.time()
+             with torch.no_grad():
+                 outputs = model(**inputs)
+             end = time.time()
+             print("time taken for inference", end - start)
+
+             start = time.time()
+             boxes = image_processor.post_process_object_detection(
+                 outputs,
+                 target_sizes=torch.tensor([(height, width)] * len(batch)),
+                 threshold=conf_threshold,
+             )
+
+             for _, (array, box) in enumerate(zip(batch, boxes)):
+                 pil_image = draw_bounding_boxes(
+                     Image.fromarray(array), box, model, conf_threshold
+                 )
+                 frame = np.array(pil_image)
+                 # Convert RGB to BGR
+                 frame = frame[:, :, ::-1].copy()
+                 segment_file.write(frame)
+
+             batch = []
+             segment_file.release()
+             yield name
+             end = time.time()
+             print("time taken for processing boxes", end - start)
+             name = f"output_{uuid.uuid4()}.mp4"
+             segment_file = cv2.VideoWriter(
+                 name, video_codec, desired_fps, (width, height)
+             )  # type: ignore
+
+         iterating, frame = cap.read()
+         n_frames += 1
+
+
+ with gr.Blocks() as demo:
+     gr.HTML(
+         """
+     <h1 style='text-align: center'>
+     Video Object Detection with <a href='https://huggingface.co/PekingU/rtdetr_r101vd_coco_o365' target='_blank'>RT-DETR</a>
+     </h1>
+     """
+     )
+     with gr.Row():
+         with gr.Column():
+             video = gr.Video(label="Video Source")
+             conf_threshold = gr.Slider(
+                 label="Confidence Threshold",
+                 minimum=0.0,
+                 maximum=1.0,
+                 step=0.05,
+                 value=0.30,
+             )
+         with gr.Column():
+             output_video = gr.Video(
+                 label="Processed Video", streaming=True, autoplay=True
+             )
+
+     video.upload(
+         fn=stream_object_detection,
+         inputs=[video, conf_threshold],
+         outputs=[output_video],
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
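
A note on the streaming design: `stream_object_detection` is a generator, and each `yield name` hands one finished MP4 segment to `gr.Video(streaming=True)`, which plays segments as they arrive. Frames are subsampled by `SUBSAMPLE`, and a segment is flushed once `len(batch) == 2 * desired_fps`, so each yielded file covers roughly two seconds of output video. Below is a hedged way to exercise the generator outside the UI; it assumes run.py is importable, a CUDA device is available (the model is moved to `"cuda"` at import time), and a local `sample.mp4` exists, none of which this commit guarantees:

```python
# Hypothetical local driver for the generator in run.py (not part of this commit).
from run import stream_object_detection

for segment_path in stream_object_detection("sample.mp4", conf_threshold=0.3):
    print("segment ready:", segment_path)  # each path is a ~2 s annotated MP4 segment
```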