chendl committed on
Commit
0b7b08a
1 Parent(s): 95ddeec

Add application file

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. app.py +373 -0
  2. multimodal/HISTORY.md +3 -0
  3. multimodal/LICENSE +21 -0
  4. multimodal/MODEL_CARD.md +44 -0
  5. multimodal/Makefile +19 -0
  6. multimodal/README.md +233 -0
  7. multimodal/YOLOX/.gitignore +228 -0
  8. multimodal/YOLOX/.pre-commit-config.yaml +43 -0
  9. multimodal/YOLOX/.readthedocs.yaml +21 -0
  10. multimodal/YOLOX/LICENSE +201 -0
  11. multimodal/YOLOX/MANIFEST.in +2 -0
  12. multimodal/YOLOX/README.md +255 -0
  13. multimodal/YOLOX/demo/MegEngine/cpp/README.md +173 -0
  14. multimodal/YOLOX/demo/MegEngine/cpp/build.sh +61 -0
  15. multimodal/YOLOX/demo/MegEngine/cpp/yolox.cpp +470 -0
  16. multimodal/YOLOX/demo/MegEngine/python/README.md +33 -0
  17. multimodal/YOLOX/demo/MegEngine/python/build.py +53 -0
  18. multimodal/YOLOX/demo/MegEngine/python/convert_weights.py +64 -0
  19. multimodal/YOLOX/demo/MegEngine/python/demo.py +237 -0
  20. multimodal/YOLOX/demo/MegEngine/python/dump.py +51 -0
  21. multimodal/YOLOX/demo/MegEngine/python/models/__init__.py +9 -0
  22. multimodal/YOLOX/demo/MegEngine/python/models/darknet.py +154 -0
  23. multimodal/YOLOX/demo/MegEngine/python/models/network_blocks.py +183 -0
  24. multimodal/YOLOX/demo/MegEngine/python/models/yolo_fpn.py +78 -0
  25. multimodal/YOLOX/demo/MegEngine/python/models/yolo_head.py +192 -0
  26. multimodal/YOLOX/demo/MegEngine/python/models/yolo_pafpn.py +111 -0
  27. multimodal/YOLOX/demo/MegEngine/python/models/yolox.py +34 -0
  28. multimodal/YOLOX/demo/ONNXRuntime/README.md +78 -0
  29. multimodal/YOLOX/demo/ONNXRuntime/onnx_inference.py +86 -0
  30. multimodal/YOLOX/demo/OpenVINO/README.md +4 -0
  31. multimodal/YOLOX/demo/OpenVINO/cpp/CMakeLists.txt +23 -0
  32. multimodal/YOLOX/demo/OpenVINO/cpp/README.md +97 -0
  33. multimodal/YOLOX/demo/OpenVINO/cpp/yolox_openvino.cpp +529 -0
  34. multimodal/YOLOX/demo/OpenVINO/python/README.md +89 -0
  35. multimodal/YOLOX/demo/OpenVINO/python/openvino_inference.py +156 -0
  36. multimodal/YOLOX/demo/TensorRT/cpp/CMakeLists.txt +36 -0
  37. multimodal/YOLOX/demo/TensorRT/cpp/README.md +48 -0
  38. multimodal/YOLOX/demo/TensorRT/cpp/logging.h +503 -0
  39. multimodal/YOLOX/demo/TensorRT/cpp/yolox.cpp +530 -0
  40. multimodal/YOLOX/demo/TensorRT/python/README.md +46 -0
  41. multimodal/YOLOX/demo/ncnn/README.md +8 -0
  42. multimodal/YOLOX/demo/ncnn/android/README.md +27 -0
  43. multimodal/YOLOX/demo/ncnn/android/app/build.gradle +24 -0
  44. multimodal/YOLOX/demo/ncnn/android/app/src/main/AndroidManifest.xml +15 -0
  45. multimodal/YOLOX/demo/ncnn/android/app/src/main/assets/yolox.param +222 -0
  46. multimodal/YOLOX/demo/ncnn/android/app/src/main/java/com/megvii/yoloXncnn/MainActivity.java +247 -0
  47. multimodal/YOLOX/demo/ncnn/android/app/src/main/java/com/megvii/yoloXncnn/YOLOXncnn.java +27 -0
  48. multimodal/YOLOX/demo/ncnn/android/app/src/main/java/com/megvii/yoloXncnn/yoloXncnn.java +27 -0
  49. multimodal/YOLOX/demo/ncnn/android/app/src/main/jni/CMakeLists.txt +14 -0
  50. multimodal/YOLOX/demo/ncnn/android/app/src/main/jni/yoloXncnn_jni.cpp +474 -0
app.py ADDED
@@ -0,0 +1,373 @@
+ import os
+ os.system("cd open_flamingo && pip install .")
+ import base64
+ from io import BytesIO
+
+ import numpy as np
+ import torch
+ from PIL import Image
+ from tqdm import tqdm
+
+ import gradio as gr
+ from huggingface_hub import hf_hub_download
+
+ from open_flamingo.src.factory import create_model_and_transforms
+
+ flamingo, image_processor, tokenizer, vis_embed_size = create_model_and_transforms(
+     "ViT-L-14",
+     "datacomp_xl_s13b_b90k",
+     "EleutherAI/pythia-1.4b",
+     "EleutherAI/pythia-1.4b",
+     add_visual_grounding=True,
+     location_token_num=1000,
+     add_visual_token=True,
+     use_format_v2=True,
+ )
+
+ checkpoint_path = hf_hub_download("chendl/compositional_test", "pythiaS.pt")
+ checkpoint = torch.load(checkpoint_path, map_location="cpu")
+ model_state_dict = {}
+ for key in checkpoint.keys():
+     model_state_dict[key.replace("module.", "")] = checkpoint[key]
+ if "vision_encoder.logit_scale" in model_state_dict:
+     # previous checkpoints carry some unnecessary weights
+     del model_state_dict["vision_encoder.logit_scale"]
+     del model_state_dict["vision_encoder.visual.proj"]
+     del model_state_dict["vision_encoder.visual.ln_post.weight"]
+     del model_state_dict["vision_encoder.visual.ln_post.bias"]
+ flamingo.load_state_dict(model_state_dict, strict=True)
+
+
+ def get_iou(box1, box2):
+     # intersection over union of two [x1, y1, x2, y2] boxes
+     x1 = max(box1[0], box2[0])
+     y1 = max(box1[1], box2[1])
+     x2 = min(box1[2], box2[2])
+     y2 = min(box1[3], box2[3])
+     intersection = max(0.0, x2 - x1) * max(0.0, y2 - y1)
+     area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
+     area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
+     union = area1 + area2 - intersection
+     return intersection / union if union > 0 else 0.0
+
+
+ def get_outputs(
+     model,
+     batch_images,
+     attention_mask,
+     max_generation_length,
+     min_generation_length,
+     num_beams,
+     length_penalty,
+     input_ids,
+     image_start_index_list=None,
+     image_nums=None,
+     bad_words_ids=None,
+ ):
+     with torch.inference_mode():
+         outputs = model.generate(
+             batch_images,
+             input_ids,
+             attention_mask=attention_mask,
+             max_new_tokens=max_generation_length,
+             min_length=min_generation_length,
+             num_beams=num_beams,
+             length_penalty=length_penalty,
+             image_start_index_list=image_start_index_list,
+             image_nums=image_nums,
+             bad_words_ids=bad_words_ids,
+         )
+     return outputs
+
+
+ def evaluate_refcoco(
+     model,
+     tokenizer,
+     image_processor,
+     batch_size,
+     tsvfile,
+     max_generation_length=20,
+     num_beams=3,
+     length_penalty=-2.0,
+     device=-1,
+     vis_embed_size=None,
+     rank=0,
+     world_size=1,
+     id=0,
+ ):
+     model.eval().cuda()
+     loc_token_ids = []
+     for i in range(1000):
+         loc_token_ids.append(int(tokenizer(f"<loc_{i}>", add_special_tokens=False)["input_ids"][-1]))
+     media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
+     endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
+     pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
+     bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
+     prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
+     total = 0
+     correct = 0
+     ious = []
+     if "refcocog" in tsvfile:
+         dataset_name = "refcocog"
+     elif "refcocoplus" in tsvfile:
+         dataset_name = "refcocoplus"
+     else:
+         dataset_name = "refcoco"
+     with open(tsvfile, "r") as f:
+         lines = f.readlines()
+         pbar = tqdm(lines, disable=(rank != 0))
+         for ii, line in enumerate(pbar):
+             if ii % world_size != rank:
+                 continue
+             total += 1
+             line = line.rstrip()
+             uniq_id, image_id, text, region_coord, image = line.split("\t")
+
+             image = Image.open(BytesIO(base64.urlsafe_b64decode(image))).convert("RGB")
+
+             gt_box = np.array(list(map(float, region_coord.split(","))))
+             width = image.width
+             height = image.height
+             image = image.resize((224, 224))
+             gt_box = gt_box / np.array([width, height, width, height]) * 224
+             batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
+             prompt = [
+                 f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token * vis_embed_size}<|#endofimage#|><|#object#|>{text.rstrip('.').strip()}<|#endofobject#|><|#visual#|>"]
+
+             encodings = tokenizer(
+                 prompt,
+                 padding="longest",
+                 truncation=True,
+                 return_tensors="pt",
+                 max_length=2000,
+             )
+             input_ids = encodings["input_ids"]
+             attention_mask = encodings["attention_mask"]
+             image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
+             image_start_index_list = [[x] for x in image_start_index_list]
+             image_nums = [1] * len(input_ids)
+             vision_x = batch_images.cuda()
+             lang_x = input_ids.cuda()
+             attention_mask = attention_mask.cuda()
+
+             model.debug_id = 0
+             with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
+                 outputs = model(
+                     vision_x=vision_x,
+                     lang_x=lang_x,
+                     attention_mask=attention_mask,
+                     labels=None,
+                     image_nums=image_nums,
+                     image_start_index_list=image_start_index_list,
+                     added_bbox_list=None,
+                     add_box=False,
+                 )
+             boxes = outputs["boxes"]
+             scores = outputs["scores"]
+             if len(scores) > 0:
+                 box = boxes[scores.argmax()]
+                 iou = get_iou(box, gt_box)
+             else:
+                 iou = 0.0
+                 tqdm.write(f"no output for: {uniq_id}, {image_id}, {text}")
+             if iou >= 0.5:
+                 correct += 1
+             pbar.set_description(f"iou: {iou:.2f} score: {correct / total:.4f}")
+
+
+ def generate(
+     idx,
+     image,
+     text,
+     vis_embed_size=256,
+     rank=0,
+     world_size=1,
+ ):
+     if image is None:
+         raise gr.Error("Please upload an image.")
+     flamingo.eval()
+     loc_token_ids = []
+     for i in range(1000):
+         loc_token_ids.append(int(tokenizer(f"<loc_{i}>", add_special_tokens=False)["input_ids"][-1]))
+     media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
+     endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
+     pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
+     bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
+     prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
+
+     image_ori = image
+     image = image.convert("RGB")
+     width = image.width
+     height = image.height
+     image = image.resize((224, 224))
+     batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
+     if idx == 1:
+         # grounding: ask the model to localize the described object
+         prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token * vis_embed_size}<|#endofimage#|><|#object#|>{text.rstrip('.').strip()}<|#endofobject#|><|#visual#|>"]
+         bad_words_ids = None
+         max_generation_length = 5
+     else:
+         # captioning / VQA: block location tokens during generation
+         prompt = [f"<|#image#|>{tokenizer.pad_token * vis_embed_size}<|#endofimage#|>{text.rstrip('.')}"]
+         bad_words_ids = [[token_id] for token_id in loc_token_ids]
+         max_generation_length = 30
+     encodings = tokenizer(
+         prompt,
+         padding="longest",
+         truncation=True,
+         return_tensors="pt",
+         max_length=2000,
+     )
+     input_ids = encodings["input_ids"]
+     attention_mask = encodings["attention_mask"]
+     image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
+     image_start_index_list = [[x] for x in image_start_index_list]
+     image_nums = [1] * len(input_ids)
+     outputs = get_outputs(
+         model=flamingo,
+         batch_images=batch_images,
+         attention_mask=attention_mask,
+         max_generation_length=max_generation_length,
+         min_generation_length=4,
+         num_beams=1,
+         length_penalty=1.0,
+         input_ids=input_ids,
+         bad_words_ids=bad_words_ids,
+         image_start_index_list=image_start_index_list,
+         image_nums=image_nums,
+     )
+
+     gen_text = tokenizer.batch_decode(outputs)
+     if idx == 1:
+         # return the generated text together with the uploaded image
+         return f"Output:{gen_text}", image_ori
+     elif idx == 2:
+         return f"Question: {text.strip()} Answer: {gen_text}"
+     else:
+         return f"Output:{gen_text}"
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown(
+         """
+         🍜 Object Centric Pretraining Demo
+         In this demo we showcase the in-context learning and grounding capabilities of the Object-Centric Pretrained model, a large multimodal model. Note that we add two additional demonstrations to the ones presented to improve the demo experience.
+         The model is trained on an interleaved mixture of text, images, and bounding boxes, and is able to generate text conditioned on sequences of images/text.
+         """
+     )
+
+     with gr.Accordion("See terms and conditions"):
+         gr.Markdown(
+             """**Please read the following information carefully before proceeding.** This demo does NOT store any personal information on its users, and it does NOT store user queries.""")
+
+     with gr.Tab("📷 Image Captioning"):
+         with gr.Row():
+             query_image = gr.Image(type="pil")
+         with gr.Row():
+             chat_input = gr.Textbox(lines=1, label="Chat Input")
+             text_output = gr.Textbox(value="Output:", label="Model output")
+
+         run_btn = gr.Button("Run model")
+
+         def on_click_fn(img, text): return generate(0, img, text)
+
+         run_btn.click(on_click_fn, inputs=[query_image, chat_input], outputs=[text_output])
+
+     with gr.Tab("🦓 Grounding"):
+         with gr.Row():
+             with gr.Column(scale=1):
+                 query_image = gr.Image(type="pil")
+             with gr.Column(scale=1):
+                 out_image = gr.Image(type="pil")
+         with gr.Row():
+             chat_input = gr.Textbox(lines=1, label="Chat Input")
+             text_output = gr.Textbox(value="Output:", label="Model output")
+
+         run_btn = gr.Button("Run model")
+
+         def on_click_fn(img, text): return generate(1, img, text)
+
+         run_btn.click(on_click_fn, inputs=[query_image, chat_input], outputs=[text_output, out_image])
+
+     with gr.Tab("🔢 Counting objects"):
+         with gr.Row():
+             query_image = gr.Image(type="pil")
+         with gr.Row():
+             chat_input = gr.Textbox(lines=1, label="Chat Input")
+             text_output = gr.Textbox(value="Output:", label="Model output")
+
+         run_btn = gr.Button("Run model")
+
+         def on_click_fn(img, text): return generate(0, img, text)
+
+         run_btn.click(on_click_fn, inputs=[query_image, chat_input], outputs=[text_output])
+
+     with gr.Tab("🕵️ Visual Question Answering"):
+         with gr.Row():
+             query_image = gr.Image(type="pil")
+         with gr.Row():
+             question = gr.Textbox(lines=1, label="Question")
+             text_output = gr.Textbox(value="Output:", label="Model output")
+
+         run_btn = gr.Button("Run model")
+
+         def on_click_fn(img, txt): return generate(2, img, txt)
+
+         run_btn.click(
+             on_click_fn, inputs=[query_image, question], outputs=[text_output]
+         )
+
+     with gr.Tab("🌎 Custom"):
+         gr.Markdown(
+             """### Customize the demonstration by uploading your own images and text samples.
+             ### **Note: Any text prompt you use will be prepended with an 'Output:', so you don't need to include it in your prompt.**"""
+         )
+         with gr.Row():
+             query_image = gr.Image(type="pil")
+         with gr.Row():
+             question = gr.Textbox(lines=1, label="Question")
+             text_output = gr.Textbox(value="Output:", label="Model output")
+
+         run_btn = gr.Button("Run model")
+
+         def on_click_fn(img, txt): return generate(2, img, txt)
+
+         run_btn.click(
+             on_click_fn, inputs=[query_image, question], outputs=[text_output]
+         )
+
+ demo.queue(concurrency_count=1)
+ demo.launch()
multimodal/HISTORY.md ADDED
@@ -0,0 +1,3 @@
1
+ ## 1.0.0
2
+
3
+ * it works
multimodal/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Anas Awadalla, Irena Gao, Joshua Gardner, Jack Hessel, Yusuf Hanafy, Wanrong Zhu, Kalyani Marathe, Yonatan Bitton, Samir Gadre, Jenia Jitsev, Simon Kornblith, Pang Wei Koh, Gabriel Ilharco, Mitchell Wortsman, Ludwig Schmidt.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
multimodal/MODEL_CARD.md ADDED
@@ -0,0 +1,44 @@
1
+ ---
2
+ language: en
3
+ datasets:
4
+ - laion2b
5
+ ---
6
+
7
+ # OpenFlamingo-9B
8
+
9
+ [Blog post]() | [Code](https://github.com/mlfoundations/open_flamingo) | [Demo](https://7164d2142d11.ngrok.app)
10
+
11
+ OpenFlamingo is an open source implementation of DeepMind's [Flamingo](https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model) models.
12
+ OpenFlamingo-9B is built off of [CLIP ViT-L/14](https://huggingface.co/openai/clip-vit-large-patch14) and [LLaMA-7B](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/).
13
+
14
+
15
+ ## Model Details
16
+ We freeze the pretrained vision encoder and language model, and then we train connecting Perceiver modules and cross-attention layers, following the original Flamingo paper.
17
+
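
As a rough, hypothetical illustration of this training setup (not the repository's actual code), the sketch below freezes an existing PyTorch model and leaves only newly added modules trainable. The keyword names `perceiver` and `gated_cross_attn` are assumptions for illustration and would need to match the real parameter names.

```python
import torch.nn as nn

def freeze_base_model(model: nn.Module, trainable_keywords=("perceiver", "gated_cross_attn")):
    """Freeze every parameter except those belonging to the newly added modules.

    NOTE: the keyword names are illustrative assumptions, not OpenFlamingo's real names.
    """
    for name, param in model.named_parameters():
        param.requires_grad = any(keyword in name for keyword in trainable_keywords)
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"trainable: {trainable / 1e6:.1f}M of {total / 1e6:.1f}M parameters")
```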
18
+ Our training data is a mixture of [LAION 2B](https://huggingface.co/datasets/laion/laion2B-en) and a large interleaved image-text dataset called Multimodal C4, which will be released soon.
19
+
20
+ The current model is an early checkpoint of an ongoing effort. This checkpoint has seen 5 million interleaved image-text examples from Multimodal C4 and 10 million samples from LAION 2B.
21
+
22
+ ## Uses
23
+ OpenFlamingo-9B is intended to be used **for academic research purposes only.** Commercial use is prohibited, in line with LLaMA's non-commercial license.
24
+
25
+ ### Bias, Risks, and Limitations
26
+ This model may generate inaccurate or offensive outputs, reflecting biases in its training data and pretrained priors.
27
+
28
+ In an effort to mitigate current potential biases and harms, we have deployed a text content filter on model outputs in the OpenFlamingo demo. We continue to red-team the model to understand and improve its safety.
29
+
30
+ ## Evaluation
31
+ We've evaluated this checkpoint on the validation sets for two vision-language tasks: COCO captioning and VQAv2. Results are displayed below.
32
+
33
+ **COCO (CIDEr)**
34
+
35
+ |0-shot|4-shot|8-shot|16-shot|32-shot|
36
+ |--|--|--|--|--|
37
+ |65.52|74.28|79.26|81.84|84.52|
38
+
39
+
40
+ **VQAv2 (VQA accuracy)**
41
+
42
+ |0-shot|4-shot|8-shot|16-shot|32-shot|
43
+ |---|---|---|---|---|
44
+ |43.55|44.05|47.5|48.87|50.34|
multimodal/Makefile ADDED
@@ -0,0 +1,19 @@
1
+ install: ## [Local development] Upgrade pip, install requirements, install package.
2
+ python -m pip install -U pip
3
+ python -m pip install -e .
4
+
5
+ install-dev: ## [Local development] Install test requirements
6
+ python -m pip install -r requirements-test.txt
7
+
8
+ lint: ## [Local development] Run mypy, pylint and black
9
+ python -m mypy open_flamingo
10
+ python -m pylint open_flamingo
11
+ python -m black --check -l 120 open_flamingo
12
+
13
+ black: ## [Local development] Auto-format python code using black
14
+ python -m black -l 120 .
15
+
16
+ .PHONY: help
17
+
18
+ help: # Run `make help` to get help on the make commands
19
+ @grep -E '^[0-9a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
multimodal/README.md ADDED
@@ -0,0 +1,233 @@
1
+ # 🦩 OpenFlamingo
2
+
3
+ [![PyPI version](https://badge.fury.io/py/open_flamingo.svg)](https://badge.fury.io/py/open_flamingo)
4
+
5
+ [Blog post](https://laion.ai/blog/open-flamingo/) | Paper (coming soon)
6
+
7
+ Welcome to our open source version of DeepMind's [Flamingo](https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model) model! In this repository, we provide a PyTorch implementation for training and evaluating OpenFlamingo models. We also provide an initial [OpenFlamingo 9B model](https://huggingface.co/openflamingo/OpenFlamingo-9B) trained on a new Multimodal C4 dataset (coming soon). Please refer to our blog post for more details.
8
+
9
+ This repo is still under development, and we hope to release better performing and larger OpenFlamingo models soon. If you have any questions, please feel free to open an issue. We also welcome contributions!
10
+
11
+ # Table of Contents
12
+ - [Installation](#installation)
13
+ - [Approach](#approach)
14
+ * [Model architecture](#model-architecture)
15
+ - [Usage](#usage)
16
+ * [Initializing an OpenFlamingo model](#initializing-an-openflamingo-model)
17
+ * [Generating text](#generating-text)
18
+ - [Training](#training)
19
+ * [Dataset](#dataset)
20
+ - [Evaluation](#evaluation)
21
+ - [Future plans](#future-plans)
22
+ - [Team](#team)
23
+ - [Acknowledgments](#acknowledgments)
24
+ - [Citing](#citing)
25
+
26
+ # Installation
27
+
28
+ To install the package in an existing environment, run
29
+ ```
30
+ pip install open-flamingo
31
+ ```
32
+
33
+ or to create a conda environment for running OpenFlamingo, run
34
+ ```
35
+ conda env create -f environment.yml
36
+ ```
37
+
38
+ # Usage
39
+ We provide an initial [OpenFlamingo 9B model](https://huggingface.co/openflamingo/OpenFlamingo-9B) using a CLIP ViT-Large vision encoder and a LLaMA-7B language model. In general, we support any [CLIP vision encoder](https://huggingface.co/models?search=clip). For the language model, we support [LLaMA](https://huggingface.co/models?search=llama), [OPT](https://huggingface.co/models?search=opt), [GPT-Neo](https://huggingface.co/models?search=gpt-neo), [GPT-J](https://huggingface.co/models?search=gptj), and [Pythia](https://huggingface.co/models?search=pythia) models.
40
+
41
+ #### NOTE: To use LLaMA models, you will need to install the latest version of transformers via
42
+ ```
43
+ pip install git+https://github.com/huggingface/transformers
44
+ ```
45
+ Use this [script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py) for converting LLaMA weights to HuggingFace format.
46
+
47
+ ## Initializing an OpenFlamingo model
48
+ ``` python
49
+ from open_flamingo import create_model_and_transforms
50
+
51
+ model, image_processor, tokenizer = create_model_and_transforms(
52
+ clip_vision_encoder_path="ViT-L-14",
53
+ clip_vision_encoder_pretrained="openai",
54
+ lang_encoder_path="<path to llama weights in HuggingFace format>",
55
+ tokenizer_path="<path to llama tokenizer in HuggingFace format>",
56
+ cross_attn_every_n_layers=4
57
+ )
58
+
59
+ # grab model checkpoint from huggingface hub
60
+ from huggingface_hub import hf_hub_download
61
+ import torch
62
+
63
+ checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-9B", "checkpoint.pt")
64
+ model.load_state_dict(torch.load(checkpoint_path), strict=False)
65
+ ```
66
+
67
+ ## Generating text
68
+ Here is an example of generating text conditioned on interleaved images and text; in this case, we do few-shot image captioning.
69
+
70
+ ``` python
71
+ from PIL import Image
72
+ import requests
73
+
74
+ """
75
+ Step 1: Load images
76
+ """
77
+ demo_image_one = Image.open(
78
+ requests.get(
79
+ "http://images.cocodataset.org/val2017/000000039769.jpg", stream=True
80
+ ).raw
81
+ )
82
+
83
+ demo_image_two = Image.open(
84
+ requests.get(
85
+ "http://images.cocodataset.org/test-stuff2017/000000028137.jpg",
86
+ stream=True
87
+ ).raw
88
+ )
89
+
90
+ query_image = Image.open(
91
+ requests.get(
92
+ "http://images.cocodataset.org/test-stuff2017/000000028352.jpg",
93
+ stream=True
94
+ ).raw
95
+ )
96
+
97
+
98
+ """
99
+ Step 2: Preprocessing images
100
+ Details: For OpenFlamingo, we expect the image to be a torch tensor of shape
101
+ batch_size x num_media x num_frames x channels x height x width.
102
+ In this case batch_size = 1, num_media = 3, num_frames = 1
103
+ (this will always be one except for video, which we don't support yet),
104
+ channels = 3, height = 224, width = 224.
105
+ """
106
+ vision_x = [image_processor(demo_image_one).unsqueeze(0), image_processor(demo_image_two).unsqueeze(0), image_processor(query_image).unsqueeze(0)]
107
+ vision_x = torch.cat(vision_x, dim=0)
108
+ vision_x = vision_x.unsqueeze(1).unsqueeze(0)
109
+
110
+ """
111
+ Step 3: Preprocessing text
112
+ Details: In the text we expect an <|#image#|> special token to indicate where an image is.
113
+ We also expect an <|endofchunk|> special token to indicate the end of the text
114
+ portion associated with an image.
115
+ """
116
+ tokenizer.padding_side = "left" # For generation padding tokens should be on the left
117
+ lang_x = tokenizer(
118
+ ["<|#image#|>An image of two cats.<|endofchunk|><|#image#|>An image of a bathroom sink.<|endofchunk|><|#image#|>An image of"],
119
+ return_tensors="pt",
120
+ )
121
+
122
+
123
+ """
124
+ Step 4: Generate text
125
+ """
126
+ generated_text = model.generate(
127
+ vision_x=vision_x,
128
+ lang_x=lang_x["input_ids"],
129
+ attention_mask=lang_x["attention_mask"],
130
+ max_new_tokens=20,
131
+ num_beams=3,
132
+ )
133
+
134
+ print("Generated text: ", tokenizer.decode(generated_text[0]))
135
+ ```
136
+
137
+ # Approach
138
+ OpenFlamingo is a multimodal language model that can be used for a variety of tasks. It is trained on a large multimodal dataset (e.g. Multimodal C4) and can be used to generate text conditioned on interleaved images/text. For example, OpenFlamingo can be used to generate a caption for an image, or to generate a question given an image and a text passage. The benefit of this approach is that we are able to rapidly adapt to new tasks using in-context learning.
139
+
140
+ ## Model architecture
141
+ OpenFlamingo seeks to fuse a pretrained vision encoder and a language model using cross attention layers. The model architecture is shown below.
142
+
143
+ ![OpenFlamingo architecture](docs/flamingo.png)
144
+ Credit: [Flamingo](https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model)
145
+
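
To make the fusion idea concrete, here is a minimal, self-contained sketch of a gated cross-attention block in PyTorch. It illustrates the general mechanism only and is not the layer implemented in this repository; all dimensions and names are placeholders.

```python
import torch
import torch.nn as nn

class GatedCrossAttentionBlock(nn.Module):
    """Minimal sketch: language hidden states attend to visual tokens, and a
    zero-initialized tanh gate lets the frozen LM start out unchanged."""

    def __init__(self, dim: int, num_heads: int = 8):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.gate = nn.Parameter(torch.zeros(1))  # gate starts closed: output == input

    def forward(self, text_hidden: torch.Tensor, visual_tokens: torch.Tensor) -> torch.Tensor:
        attended, _ = self.attn(text_hidden, visual_tokens, visual_tokens)
        return text_hidden + torch.tanh(self.gate) * attended

# toy shapes: batch of 2, 16 text positions, 64 visual tokens, width 512
block = GatedCrossAttentionBlock(dim=512)
out = block(torch.randn(2, 16, 512), torch.randn(2, 64, 512))
print(out.shape)  # torch.Size([2, 16, 512])
```

In the full model, blocks of this kind are inserted between the frozen language-model layers (every `cross_attn_every_n_layers` layers in the initialization snippet above), and only the newly added modules are updated during training.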
146
+ # Training
147
+ To train a model, modify the following example command, which uses OPT 1.3B as an example LM:
148
+ ```
149
+ torchrun --nnodes=1 --nproc_per_node=4 train.py \
150
+ --run_name flamingo3B \
151
+ --lm_path facebook/opt-1.3b \
152
+ --tokenizer_path facebook/opt-1.3b \
153
+ --dataset_resampled \
154
+ --laion_shards "/path/to/shards/shard-{0000..0999}.tar" \
155
+ --mmc4_shards "/path/to/shards/shard-{0000..0999}.tar" \
156
+ --batch_size_mmc4 4 \
157
+ --batch_size_laion 8 \
158
+ --train_num_samples_mmc4 125000 \
159
+ --train_num_samples_laion 250000 \
160
+ --loss_multiplier_laion 0.2 \
161
+ --workers=6 \
162
+ --num_epochs 250 \
163
+ --lr_scheduler constant \
164
+ --warmup_steps 5000 \
165
+ --use_media_placement_augmentation \
166
+ --mmc4_textsim_threshold 30
167
+ ```
168
+
169
+ ## Dataset
170
+ We expect all our training datasets to be [WebDataset](https://github.com/webdataset/webdataset) shards.
171
+ We train our models on the [LAION 2B](https://huggingface.co/datasets/laion/laion2B-en) and Multimodal C4 (coming soon) datasets. LAION 2B is already in WebDataset format when downloaded with the [img2dataset tool](https://github.com/rom1504/img2dataset), and Multimodal C4 comes packaged in WebDataset format.
172
+
173
+
174
+ # Evaluation
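For reference, a minimal sketch of reading such shards with the `webdataset` package might look like the following; the shard pattern and the `jpg`/`txt` field names are assumptions that should be matched to how the shards were actually written.

```python
import webdataset as wds

# Shard pattern and field extensions ("jpg", "txt") are illustrative; match them
# to how your shards were produced (e.g. by img2dataset).
dataset = (
    wds.WebDataset("/path/to/shards/shard-{0000..0999}.tar")
    .decode("pil")              # decode images to PIL.Image
    .to_tuple("jpg", "txt")     # yield (image, caption) pairs
)

for image, caption in dataset:
    print(image.size, caption[:60])
    break
```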
175
+ We currently support running evaluations on [COCO](https://cocodataset.org/#home), [VQAv2](https://visualqa.org/index.html), [OKVQA](https://okvqa.allenai.org), [Flickr30k](https://www.kaggle.com/datasets/hsankesara/flickr-image-dataset), and [ImageNet](https://image-net.org/index.php). Note that currently these evaluations are run in validation mode (as specified in the Flamingo paper). We will be adding support for running evaluations in test mode in the future.
176
+
177
+ Before evaluating the model, you will need to install the coco evaluation package by running the following command:
178
+ ```
179
+ pip install pycocoevalcap
180
+ ```
181
+
182
+ To run evaluations on OKVQA, you will also need to download the WordNet data by running the following in Python:
183
+ ```
184
+ import nltk
185
+ nltk.download('wordnet')
186
+ ```
187
+
188
+ To evaluate the model, run the script at `open_flamingo/scripts/run_eval.sh`
189
+
190
+ # Future plans
191
+ - [ ] Add support for video input
192
+ - [ ] Release better performing and larger OpenFlamingo models
193
+ - [ ] Expand our evaluation suite
194
+ - [ ] Add support for FSDP training
195
+
196
+ # Team
197
+
198
+ OpenFlamingo is developed by:
199
+
200
+ [Anas Awadalla](https://anas-awadalla.streamlit.app/), [Irena Gao](https://i-gao.github.io/), [Joshua Gardner](https://homes.cs.washington.edu/~jpgard/), [Jack Hessel](https://jmhessel.com/), [Yusuf Hanafy](https://www.linkedin.com/in/yusufhanafy/), [Wanrong Zhu](https://wanrong-zhu.com/), [Kalyani Marathe](https://sites.google.com/uw.edu/kalyanimarathe/home?authuser=0), [Yonatan Bitton](https://yonatanbitton.github.io/), [Samir Gadre](https://sagadre.github.io/), [Jenia Jitsev](https://scholar.google.de/citations?user=p1FuAMkAAAAJ&hl=en), [Simon Kornblith](https://simonster.com/), [Pang Wei Koh](https://koh.pw/), [Gabriel Ilharco](https://gabrielilharco.com/), [Mitchell Wortsman](https://mitchellnw.github.io/), [Ludwig Schmidt](https://people.csail.mit.edu/ludwigs/).
201
+
202
+ The team is primarily from the University of Washington, Stanford, AI2, UCSB, and Google.
203
+
204
+ # Acknowledgments
205
+ This code is based on Lucidrains' [flamingo implementation](https://github.com/lucidrains/flamingo-pytorch) and David Hansmair's [flamingo-mini repo](https://github.com/dhansmair/flamingo-mini). Thank you for making your code public! We also thank the [OpenCLIP](https://github.com/mlfoundations/open_clip) team as we use their data loading code and take inspiration from their library design.
206
+
207
+ We would also like to thank [Jean-Baptiste Alayrac](https://www.jbalayrac.com) and [Antoine Miech](https://antoine77340.github.io) for their advice, [Rohan Taori](https://www.rohantaori.com/), [Nicholas Schiefer](https://nicholasschiefer.com/), [Deep Ganguli](https://hai.stanford.edu/people/deep-ganguli), [Thomas Liao](https://thomasliao.com/), [Tatsunori Hashimoto](https://thashim.github.io/), and [Nicholas Carlini](https://nicholas.carlini.com/) for their help with assessing the safety risks of our release, and to [Stability AI](https://stability.ai) for providing us with compute resources to train these models.
208
+
209
+ # Citing
210
+ If you found this repository useful, please consider citing:
211
+
212
+ ```
213
+ @software{anas_awadalla_2023_7733589,
214
+ author = {Awadalla, Anas and Gao, Irena and Gardner, Joshua and Hessel, Jack and Hanafy, Yusuf and Zhu, Wanrong and Marathe, Kalyani and Bitton, Yonatan and Gadre, Samir and Jitsev, Jenia and Kornblith, Simon and Koh, Pang Wei and Ilharco, Gabriel and Wortsman, Mitchell and Schmidt, Ludwig},
215
+ title = {OpenFlamingo},
216
+ month = mar,
217
+ year = 2023,
218
+ publisher = {Zenodo},
219
+ version = {v0.1.1},
220
+ doi = {10.5281/zenodo.7733589},
221
+ url = {https://doi.org/10.5281/zenodo.7733589}
222
+ }
223
+ ```
224
+
225
+ ```
226
+ @article{Alayrac2022FlamingoAV,
227
+ title={Flamingo: a Visual Language Model for Few-Shot Learning},
228
+ author={Jean-Baptiste Alayrac and Jeff Donahue and Pauline Luc and Antoine Miech and Iain Barr and Yana Hasson and Karel Lenc and Arthur Mensch and Katie Millican and Malcolm Reynolds and Roman Ring and Eliza Rutherford and Serkan Cabi and Tengda Han and Zhitao Gong and Sina Samangooei and Marianne Monteiro and Jacob Menick and Sebastian Borgeaud and Andy Brock and Aida Nematzadeh and Sahand Sharifzadeh and Mikolaj Binkowski and Ricardo Barreira and Oriol Vinyals and Andrew Zisserman and Karen Simonyan},
229
+ journal={ArXiv},
230
+ year={2022},
231
+ volume={abs/2204.14198}
232
+ }
233
+ ```
multimodal/YOLOX/.gitignore ADDED
@@ -0,0 +1,228 @@
1
+ ### Linux ###
2
+ *~
3
+
4
+ # user experiments directory
5
+ YOLOX_outputs/
6
+ datasets/
7
+ # do not ignore datasets under yolox/data
8
+ !*yolox/data/datasets/
9
+
10
+ # temporary files which can be created if a process still has a handle open of a deleted file
11
+ .fuse_hidden*
12
+
13
+ # KDE directory preferences
14
+ .directory
15
+
16
+ # Linux trash folder which might appear on any partition or disk
17
+ .Trash-*
18
+
19
+ # .nfs files are created when an open file is removed but is still being accessed
20
+ .nfs*
21
+
22
+ ### PyCharm ###
23
+ # User-specific stuff
24
+ .idea
25
+
26
+ # CMake
27
+ cmake-build-*/
28
+
29
+ # Mongo Explorer plugin
30
+ .idea/**/mongoSettings.xml
31
+
32
+ # File-based project format
33
+ *.iws
34
+
35
+ # IntelliJ
36
+ out/
37
+
38
+ # mpeltonen/sbt-idea plugin
39
+ .idea_modules/
40
+
41
+ # JIRA plugin
42
+ atlassian-ide-plugin.xml
43
+
44
+ # Cursive Clojure plugin
45
+ .idea/replstate.xml
46
+
47
+ # Crashlytics plugin (for Android Studio and IntelliJ)
48
+ com_crashlytics_export_strings.xml
49
+ crashlytics.properties
50
+ crashlytics-build.properties
51
+ fabric.properties
52
+
53
+ # Editor-based Rest Client
54
+ .idea/httpRequests
55
+
56
+ # Android studio 3.1+ serialized cache file
57
+ .idea/caches/build_file_checksums.ser
58
+
59
+ # JetBrains templates
60
+ **___jb_tmp___
61
+
62
+ ### Python ###
63
+ # Byte-compiled / optimized / DLL files
64
+ __pycache__/
65
+ *.py[cod]
66
+ *$py.class
67
+
68
+ # C extensions
69
+ *.so
70
+
71
+ # Distribution / packaging
72
+ .Python
73
+ build/
74
+ develop-eggs/
75
+ dist/
76
+ downloads/
77
+ eggs/
78
+ .eggs/
79
+ lib/
80
+ lib64/
81
+ parts/
82
+ sdist/
83
+ var/
84
+ wheels/
85
+ pip-wheel-metadata/
86
+ share/python-wheels/
87
+ *.egg-info/
88
+ .installed.cfg
89
+ *.egg
90
+ MANIFEST
91
+
92
+ # PyInstaller
93
+ # Usually these files are written by a python script from a template
94
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
95
+ *.manifest
96
+ *.spec
97
+
98
+ # Installer logs
99
+ pip-log.txt
100
+ pip-delete-this-directory.txt
101
+
102
+ # Unit test / coverage reports
103
+ htmlcov/
104
+ .tox/
105
+ .nox/
106
+ .coverage
107
+ .coverage.*
108
+ .cache
109
+ nosetests.xml
110
+ coverage.xml
111
+ *.cover
112
+ .hypothesis/
113
+ .pytest_cache/
114
+
115
+ # Translations
116
+ *.mo
117
+ *.pot
118
+
119
+ # Django stuff:
120
+ *.log
121
+ local_settings.py
122
+ db.sqlite3
123
+
124
+ # Flask stuff:
125
+ instance/
126
+ .webassets-cache
127
+
128
+ # Scrapy stuff:
129
+ .scrapy
130
+
131
+ # Sphinx documentation
132
+ docs/_build/
133
+ docs/build/
134
+
135
+ # PyBuilder
136
+ target/
137
+
138
+ # Jupyter Notebook
139
+ .ipynb_checkpoints
140
+
141
+ # IPython
142
+ profile_default/
143
+ ipython_config.py
144
+
145
+ # pyenv
146
+ .python-version
147
+
148
+ # pipenv
149
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
150
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
151
+ # having no cross-platform support, pipenv may install dependencies that don’t work, or not
152
+ # install all needed dependencies.
153
+ #Pipfile.lock
154
+
155
+ # celery beat schedule file
156
+ celerybeat-schedule
157
+
158
+ # SageMath parsed files
159
+ *.sage.py
160
+
161
+ # Environments
162
+ .env
163
+ .venv
164
+ env/
165
+ venv/
166
+ ENV/
167
+ env.bak/
168
+ venv.bak/
169
+
170
+ # Spyder project settings
171
+ .spyderproject
172
+ .spyproject
173
+
174
+ # Rope project settings
175
+ .ropeproject
176
+
177
+ # mkdocs documentation
178
+ /site
179
+
180
+ # mypy
181
+ .mypy_cache/
182
+ .dmypy.json
183
+ dmypy.json
184
+
185
+ # Pyre type checker
186
+ .pyre/
187
+
188
+ ### Vim ###
189
+ # Swap
190
+ [._]*.s[a-v][a-z]
191
+ [._]*.sw[a-p]
192
+ [._]s[a-rt-v][a-z]
193
+ [._]ss[a-gi-z]
194
+ [._]sw[a-p]
195
+
196
+ # Session
197
+ Session.vim
198
+
199
+ # Temporary
200
+ .netrwhist
201
+ # Auto-generated tag files
202
+ tags
203
+ # Persistent undo
204
+ [._]*.un~
205
+
206
+ # output
207
+ docs/api
208
+ .code-workspace.code-workspace
209
+ *.pkl
210
+ *.npy
211
+ *.pth
212
+ *.onnx
213
+ *.engine
214
+ events.out.tfevents*
215
+
216
+ # vscode
217
+ *.code-workspace
218
+ .vscode
219
+
220
+ # vim
221
+ .vim
222
+
223
+ # OS generated files
224
+ .DS_Store
225
+ .DS_Store?
226
+ .Trashes
227
+ ehthumbs.db
228
+ Thumbs.db
multimodal/YOLOX/.pre-commit-config.yaml ADDED
@@ -0,0 +1,43 @@
1
+ repos:
2
+ - repo: https://github.com/pycqa/flake8
3
+ rev: 3.8.3
4
+ hooks:
5
+ - id: flake8
6
+ - repo: https://github.com/pre-commit/pre-commit-hooks
7
+ rev: v3.1.0
8
+ hooks:
9
+ - id: check-added-large-files
10
+ - id: check-docstring-first
11
+ - id: check-executables-have-shebangs
12
+ - id: check-json
13
+ - id: check-yaml
14
+ args: ["--unsafe"]
15
+ - id: debug-statements
16
+ - id: end-of-file-fixer
17
+ - id: requirements-txt-fixer
18
+ - id: trailing-whitespace
19
+ - repo: https://github.com/jorisroovers/gitlint
20
+ rev: v0.15.1
21
+ hooks:
22
+ - id: gitlint
23
+ - repo: https://github.com/pycqa/isort
24
+ rev: 4.3.21
25
+ hooks:
26
+ - id: isort
27
+
28
+ - repo: https://github.com/PyCQA/autoflake
29
+ rev: v1.4
30
+ hooks:
31
+ - id: autoflake
32
+ name: Remove unused variables and imports
33
+ entry: autoflake
34
+ language: python
35
+ args:
36
+ [
37
+ "--in-place",
38
+ "--remove-all-unused-imports",
39
+ "--remove-unused-variables",
40
+ "--expand-star-imports",
41
+ "--ignore-init-module-imports",
42
+ ]
43
+ files: \.py$
multimodal/YOLOX/.readthedocs.yaml ADDED
@@ -0,0 +1,21 @@
1
+ # .readthedocs.yaml
2
+ # Read the Docs configuration file
3
+ # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4
+
5
+ # Required
6
+ version: 2
7
+
8
+ # Build documentation in the docs/ directory with Sphinx
9
+ sphinx:
10
+ configuration: docs/conf.py
11
+
12
+ # Optionally build your docs in additional formats such as PDF
13
+ formats:
14
+ - pdf
15
+
16
+ # Optionally set the version of Python and requirements required to build your docs
17
+ python:
18
+ version: "3.7"
19
+ install:
20
+ - requirements: docs/requirements-doc.txt
21
+ - requirements: requirements.txt
multimodal/YOLOX/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "{}"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright (c) 2021-2022 Megvii Inc. All rights reserved.
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
multimodal/YOLOX/MANIFEST.in ADDED
@@ -0,0 +1,2 @@
1
+ include requirements.txt
2
+ recursive-include yolox *.cpp *.h *.cu *.cuh *.cc
multimodal/YOLOX/README.md ADDED
@@ -0,0 +1,255 @@
1
+ <div align="center"><img src="assets/logo.png" width="350"></div>
2
+ <img src="assets/demo.png" >
3
+
4
+ ## Introduction
5
+ YOLOX is an anchor-free version of YOLO, with a simpler design but better performance! It aims to bridge the gap between research and industrial communities.
6
+ For more details, please refer to our [report on Arxiv](https://arxiv.org/abs/2107.08430).
7
+
8
+ This repo is a PyTorch implementation of YOLOX; there is also a [MegEngine implementation](https://github.com/MegEngine/YOLOX).
9
+
10
+ <img src="assets/git_fig.png" width="1000" >
11
+
12
+ ## Updates!!
13
+ * 【2023/02/28】 We support an assignment visualization tool; see the doc [here](./docs/assignment_visualization.md).
14
+ * 【2022/04/14】 We support jit compile op.
15
+ * 【2021/08/19】 We optimize the training process with **2x** faster training and **~1%** higher performance! See [notes](docs/updates_note.md) for more details.
16
+ * 【2021/08/05】 We release [MegEngine version YOLOX](https://github.com/MegEngine/YOLOX).
17
+ * 【2021/07/28】 We fix a fatal [memory leak](https://github.com/Megvii-BaseDetection/YOLOX/issues/103).
18
+ * 【2021/07/26】 We now support [MegEngine](https://github.com/Megvii-BaseDetection/YOLOX/tree/main/demo/MegEngine) deployment.
19
+ * 【2021/07/20】 We have released our technical report on [Arxiv](https://arxiv.org/abs/2107.08430).
20
+
21
+ ## Coming soon
22
+ - [ ] YOLOX-P6 and larger model.
23
+ - [ ] Objects365 pretrain.
24
+ - [ ] Transformer modules.
25
+ - [ ] More features in need.
26
+
27
+ ## Benchmark
28
+
29
+ #### Standard Models.
30
+
31
+ |Model |size |mAP<sup>val<br>0.5:0.95 |mAP<sup>test<br>0.5:0.95 | Speed V100<br>(ms) | Params<br>(M) |FLOPs<br>(G)| weights |
32
+ | ------ |:---: | :---: | :---: |:---: |:---: | :---: | :----: |
33
+ |[YOLOX-s](./exps/default/yolox_s.py) |640 |40.5 |40.5 |9.8 |9.0 | 26.8 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_s.pth) |
34
+ |[YOLOX-m](./exps/default/yolox_m.py) |640 |46.9 |47.2 |12.3 |25.3 |73.8| [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_m.pth) |
35
+ |[YOLOX-l](./exps/default/yolox_l.py) |640 |49.7 |50.1 |14.5 |54.2| 155.6 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_l.pth) |
36
+ |[YOLOX-x](./exps/default/yolox_x.py) |640 |51.1 |**51.5** | 17.3 |99.1 |281.9 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_x.pth) |
37
+ |[YOLOX-Darknet53](./exps/default/yolov3.py) |640 | 47.7 | 48.0 | 11.1 |63.7 | 185.3 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_darknet.pth) |
38
+
39
+ <details>
40
+ <summary>Legacy models</summary>
41
+
42
+ |Model |size |mAP<sup>test<br>0.5:0.95 | Speed V100<br>(ms) | Params<br>(M) |FLOPs<br>(G)| weights |
43
+ | ------ |:---: | :---: |:---: |:---: | :---: | :----: |
44
+ |[YOLOX-s](./exps/default/yolox_s.py) |640 |39.6 |9.8 |9.0 | 26.8 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EW62gmO2vnNNs5npxjzunVwB9p307qqygaCkXdTO88BLUg?e=NMTQYw)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_s.pth) |
45
+ |[YOLOX-m](./exps/default/yolox_m.py) |640 |46.4 |12.3 |25.3 |73.8| [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/ERMTP7VFqrVBrXKMU7Vl4TcBQs0SUeCT7kvc-JdIbej4tQ?e=1MDo9y)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_m.pth) |
46
+ |[YOLOX-l](./exps/default/yolox_l.py) |640 |50.0 |14.5 |54.2| 155.6 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EWA8w_IEOzBKvuueBqfaZh0BeoG5sVzR-XYbOJO4YlOkRw?e=wHWOBE)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_l.pth) |
47
+ |[YOLOX-x](./exps/default/yolox_x.py) |640 |**51.2** | 17.3 |99.1 |281.9 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EdgVPHBziOVBtGAXHfeHI5kBza0q9yyueMGdT0wXZfI1rQ?e=tABO5u)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_x.pth) |
48
+ |[YOLOX-Darknet53](./exps/default/yolov3.py) |640 | 47.4 | 11.1 |63.7 | 185.3 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EZ-MV1r_fMFPkPrNjvbJEMoBLOLAnXH-XKEB77w8LhXL6Q?e=mf6wOc)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_darknet53.pth) |
49
+
50
+ </details>
51
+
52
+ #### Light Models.
53
+
54
+ |Model |size |mAP<sup>val<br>0.5:0.95 | Params<br>(M) |FLOPs<br>(G)| weights |
55
+ | ------ |:---: | :---: |:---: |:---: | :---: |
56
+ |[YOLOX-Nano](./exps/default/yolox_nano.py) |416 |25.8 | 0.91 |1.08 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_nano.pth) |
57
+ |[YOLOX-Tiny](./exps/default/yolox_tiny.py) |416 |32.8 | 5.06 |6.45 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_tiny.pth) |
58
+
59
+
60
+ <details>
61
+ <summary>Legacy models</summary>
62
+
63
+ |Model |size |mAP<sup>val<br>0.5:0.95 | Params<br>(M) |FLOPs<br>(G)| weights |
64
+ | ------ |:---: | :---: |:---: |:---: | :---: |
65
+ |[YOLOX-Nano](./exps/default/yolox_nano.py) |416 |25.3 | 0.91 |1.08 | [github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_nano.pth) |
66
+ |[YOLOX-Tiny](./exps/default/yolox_tiny.py) |416 |32.8 | 5.06 |6.45 | [github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_tiny_32dot8.pth) |
67
+
68
+ </details>
69
+
70
+ ## Quick Start
71
+
72
+ <details>
73
+ <summary>Installation</summary>
74
+
75
+ Step1. Install YOLOX from source.
76
+ ```shell
77
+ git clone git@github.com:Megvii-BaseDetection/YOLOX.git
78
+ cd YOLOX
79
+ pip3 install -v -e . # or python3 setup.py develop
80
+ ```
81
+
82
+ </details>
83
+
84
+ <details>
85
+ <summary>Demo</summary>
86
+
87
+ Step1. Download a pretrained model from the benchmark table.
88
+
89
+ Step2. Use either -n or -f to specify your detector's config. For example:
90
+
91
+ ```shell
92
+ python tools/demo.py image -n yolox-s -c /path/to/your/yolox_s.pth --path assets/dog.jpg --conf 0.25 --nms 0.45 --tsize 640 --save_result --device [cpu/gpu]
93
+ ```
94
+ or
95
+ ```shell
96
+ python tools/demo.py image -f exps/default/yolox_s.py -c /path/to/your/yolox_s.pth --path assets/dog.jpg --conf 0.25 --nms 0.45 --tsize 640 --save_result --device [cpu/gpu]
97
+ ```
98
+ Demo for video:
99
+ ```shell
100
+ python tools/demo.py video -n yolox-s -c /path/to/your/yolox_s.pth --path /path/to/your/video --conf 0.25 --nms 0.45 --tsize 640 --save_result --device [cpu/gpu]
101
+ ```
102
+
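+ If you would rather call the detector from Python than from the CLI, the sketch below shows one possible way. It is only a minimal example: it assumes the `yolox` package installed in Step1 and reuses the same helpers that `tools/demo.py` uses (`get_exp`, `preproc`, `postprocess`); the checkpoint and image paths are placeholders you need to replace.
+
+ ```python
+ import cv2
+ import torch
+
+ from yolox.data.data_augment import preproc
+ from yolox.data.datasets import COCO_CLASSES
+ from yolox.exp import get_exp
+ from yolox.utils import postprocess
+
+ # build the model from its experiment description and load a pretrained checkpoint
+ exp = get_exp(exp_name="yolox-s")  # or get_exp(exp_file="exps/default/yolox_s.py")
+ model = exp.get_model().eval()
+ ckpt = torch.load("/path/to/your/yolox_s.pth", map_location="cpu")
+ model.load_state_dict(ckpt["model"])
+
+ # letterbox-resize the image to exp.test_size and build a 1x3xHxW float tensor
+ img = cv2.imread("assets/dog.jpg")
+ inp, ratio = preproc(img, exp.test_size)
+ inp = torch.from_numpy(inp).unsqueeze(0).float()
+
+ with torch.no_grad():
+     outputs = model(inp)
+ outputs = postprocess(outputs, exp.num_classes, 0.25, 0.45)  # conf and nms thresholds
+
+ # each detection row: x1, y1, x2, y2, obj_conf, cls_conf, cls_id (in resized coordinates)
+ if outputs[0] is not None:
+     for *xyxy, obj_conf, cls_conf, cls_id in outputs[0].cpu().numpy():
+         box = [v / ratio for v in xyxy]  # map back to the original image scale
+         print(COCO_CLASSES[int(cls_id)], obj_conf * cls_conf, box)
+ ```
+
+ The 0.25/0.45 thresholds match the CLI example above; tune them for your own data.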
103
+
104
+ </details>
105
+
106
+ <details>
107
+ <summary>Reproduce our results on COCO</summary>
108
+
109
+ Step1. Prepare COCO dataset
110
+ ```shell
111
+ cd <YOLOX_HOME>
112
+ ln -s /path/to/your/COCO ./datasets/COCO
113
+ ```
114
+
115
+ Step2. Reproduce our results on COCO by specifying -n:
116
+
117
+ ```shell
118
+ python -m yolox.tools.train -n yolox-s -d 8 -b 64 --fp16 -o [--cache]
119
+ yolox-m
120
+ yolox-l
121
+ yolox-x
122
+ ```
123
+ * -d: number of gpu devices
124
+ * -b: total batch size; the recommended value for -b is num-gpu * 8
125
+ * --fp16: mixed precision training
126
+ * --cache: cache images into RAM to accelerate training, which requires a large amount of system RAM.
127
+
128
+
129
+
130
+ When using -f, the above commands are equivalent to:
131
+ ```shell
132
+ python -m yolox.tools.train -f exps/default/yolox_s.py -d 8 -b 64 --fp16 -o [--cache]
133
+ exps/default/yolox_m.py
134
+ exps/default/yolox_l.py
135
+ exps/default/yolox_x.py
136
+ ```
137
+
138
+ **Multi Machine Training**
139
+
140
+ We also support multi-node training. Just add the following args:
141
+ * --num\_machines: num of your total training nodes
142
+ * --machine\_rank: specify the rank of each node
143
+
144
+ Suppose you want to train YOLOX on 2 machines, your master machine's IP is 123.123.123.123, and you use port 12312 over TCP.
145
+
146
+ On master machine, run
147
+ ```shell
148
+ python tools/train.py -n yolox-s -b 128 --dist-url tcp://123.123.123.123:12312 --num_machines 2 --machine_rank 0
149
+ ```
150
+ On the second machine, run
151
+ ```shell
152
+ python tools/train.py -n yolox-s -b 128 --dist-url tcp://123.123.123.123:12312 --num_machines 2 --machine_rank 1
153
+ ```
154
+
155
+ **Logging to Weights & Biases**
156
+
157
+ To log metrics, predictions, and model checkpoints to [W&B](https://docs.wandb.ai/guides/integrations/other/yolox), use the command-line argument `--logger wandb` and the prefix "wandb-" to specify arguments for initializing the wandb run.
158
+
159
+ ```shell
160
+ python tools/train.py -n yolox-s -d 8 -b 64 --fp16 -o [--cache] --logger wandb wandb-project <project name>
161
+ yolox-m
162
+ yolox-l
163
+ yolox-x
164
+ ```
165
+
166
+ An example wandb dashboard is available [here](https://wandb.ai/manan-goel/yolox-nano/runs/3pzfeom0).
167
+
168
+ **Others**
169
+
170
+ See more information with the following command:
171
+ ```shell
172
+ python -m yolox.tools.train --help
173
+ ```
174
+
175
+ </details>
176
+
177
+
178
+ <details>
179
+ <summary>Evaluation</summary>
180
+
181
+ We support batch testing for fast evaluation:
182
+
183
+ ```shell
184
+ python -m yolox.tools.eval -n yolox-s -c yolox_s.pth -b 64 -d 8 --conf 0.001 [--fp16] [--fuse]
185
+ yolox-m
186
+ yolox-l
187
+ yolox-x
188
+ ```
189
+ * --fuse: fuse conv and bn
190
+ * -d: number of GPUs used for evaluation. DEFAULT: All GPUs available will be used.
191
+ * -b: total batch size across all GPUs
192
+
193
+ To reproduce the speed test, we use the following command:
194
+ ```shell
195
+ python -m yolox.tools.eval -n yolox-s -c yolox_s.pth -b 1 -d 1 --conf 0.001 --fp16 --fuse
196
+ yolox-m
197
+ yolox-l
198
+ yolox-x
199
+ ```
200
+
201
+ </details>
202
+
203
+
204
+ <details>
205
+ <summary>Tutorials</summary>
206
+
207
+ * [Training on custom data](docs/train_custom_data.md)
208
+ * [Caching for custom data](docs/cache.md)
209
+ * [Manipulating training image size](docs/manipulate_training_image_size.md)
210
+ * [Assignment visualization](docs/assignment_visualization.md)
211
+ * [Freezing model](docs/freeze_module.md)
212
+
213
+ </details>
214
+
215
+ ## Deployment
216
+
217
+
218
+ 1. [MegEngine in C++ and Python](./demo/MegEngine)
219
+ 2. [ONNX export and ONNXRuntime inference](./demo/ONNXRuntime)
220
+ 3. [TensorRT in C++ and Python](./demo/TensorRT)
221
+ 4. [ncnn in C++ and Java](./demo/ncnn)
222
+ 5. [OpenVINO in C++ and Python](./demo/OpenVINO)
223
+ 6. [Accelerate YOLOX inference with nebullvm in Python](./demo/nebullvm)
224
+
225
+ ## Third-party resources
226
+ * YOLOX for streaming perception: [StreamYOLO (CVPR 2022 Oral)](https://github.com/yancie-yjr/StreamYOLO)
227
+ * YOLOX-s and YOLOX-Nano are integrated into [ModelScope](https://www.modelscope.cn/home). Try out the online demos at [YOLOX-s](https://www.modelscope.cn/models/damo/cv_cspnet_image-object-detection_yolox/summary) and [YOLOX-Nano](https://www.modelscope.cn/models/damo/cv_cspnet_image-object-detection_yolox_nano_coco/summary) respectively 🚀.
228
+ * Integrated into [Huggingface Spaces 🤗](https://huggingface.co/spaces) using [Gradio](https://github.com/gradio-app/gradio). Try out the Web Demo: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Sultannn/YOLOX-Demo)
229
+ * The ncnn android app with video support: [ncnn-android-yolox](https://github.com/FeiGeChuanShu/ncnn-android-yolox) from [FeiGeChuanShu](https://github.com/FeiGeChuanShu)
230
+ * YOLOX with Tengine support: [Tengine](https://github.com/OAID/Tengine/blob/tengine-lite/examples/tm_yolox.cpp) from [BUG1989](https://github.com/BUG1989)
231
+ * YOLOX + ROS2 Foxy: [YOLOX-ROS](https://github.com/Ar-Ray-code/YOLOX-ROS) from [Ar-Ray](https://github.com/Ar-Ray-code)
232
+ * YOLOX Deploy DeepStream: [YOLOX-deepstream](https://github.com/nanmi/YOLOX-deepstream) from [nanmi](https://github.com/nanmi)
233
+ * YOLOX MNN/TNN/ONNXRuntime: [YOLOX-MNN](https://github.com/DefTruth/lite.ai.toolkit/blob/main/lite/mnn/cv/mnn_yolox.cpp)、[YOLOX-TNN](https://github.com/DefTruth/lite.ai.toolkit/blob/main/lite/tnn/cv/tnn_yolox.cpp) and [YOLOX-ONNXRuntime C++](https://github.com/DefTruth/lite.ai.toolkit/blob/main/lite/ort/cv/yolox.cpp) from [DefTruth](https://github.com/DefTruth)
234
+ * Converting darknet or yolov5 datasets to COCO format for YOLOX: [YOLO2COCO](https://github.com/RapidAI/YOLO2COCO) from [Daniel](https://github.com/znsoftm)
235
+
236
+ ## Cite YOLOX
237
+ If you use YOLOX in your research, please cite our work by using the following BibTeX entry:
238
+
239
+ ```latex
240
+ @article{yolox2021,
241
+ title={YOLOX: Exceeding YOLO Series in 2021},
242
+ author={Ge, Zheng and Liu, Songtao and Wang, Feng and Li, Zeming and Sun, Jian},
243
+ journal={arXiv preprint arXiv:2107.08430},
244
+ year={2021}
245
+ }
246
+ ```
247
+ ## In memory of Dr. Jian Sun
248
+ Without the guidance of [Dr. Jian Sun](http://www.jiansun.org/), YOLOX would not have been released and open sourced to the community.
249
+ The passing away of Dr. Jian is a huge loss to the Computer Vision field. We add this section here to express our remembrance and condolences to our captain Dr. Jian.
250
+ It is hoped that every AI practitioner in the world will stick to the concept of "continuous innovation to expand cognitive boundaries, and extraordinary technology to achieve product value" and move forward all the way.
251
+
252
+ <div align="center"><img src="assets/sunjian.png" width="200"></div>
253
+ Without Dr. Jian Sun's guidance, YOLOX would not have come into being or been open sourced to the community.
254
+ Dr. Jian Sun's passing is a great loss to the CV field; we add this section to express our remembrance of and grief for our "captain", Dr. Sun.
255
+ May every AI practitioner in the world hold to the idea of "continuous innovation to expand cognitive boundaries, and extraordinary technology to achieve product value" and keep moving forward.
multimodal/YOLOX/demo/MegEngine/cpp/README.md ADDED
@@ -0,0 +1,173 @@
1
+ # YOLOX-CPP-MegEngine
2
+
3
+ A C++ implementation of YOLOX object detection based on [MegEngine](https://github.com/MegEngine/MegEngine).
4
+
5
+ ## Tutorial
6
+
7
+ ### Step1: install toolchain
8
+
9
+ * host: `sudo apt install gcc g++ build-essential git git-lfs gfortran libgfortran-6-dev autoconf gnupg flex bison gperf curl zlib1g-dev gcc-multilib g++-multilib cmake` (gcc/g++ version >= 6)
10
+ * cross build android: download [NDK](https://developer.android.com/ndk/downloads)
11
+ * after unzipping the downloaded NDK, export NDK_ROOT="path of NDK"
12
+
13
+ ### Step2: build MegEngine
14
+
15
+ ```shell
16
+ git clone https://github.com/MegEngine/MegEngine.git
17
+
18
+ # then init third_party
19
+
20
+ export megengine_root="path of MegEngine"
21
+ cd $megengine_root && ./third_party/prepare.sh && ./third_party/install-mkl.sh
22
+
23
+ # build example:
24
+ # build host without cuda:
25
+ ./scripts/cmake-build/host_build.sh
26
+ # or build host with cuda:
27
+ ./scripts/cmake-build/host_build.sh -c
28
+ # or cross build for android aarch64:
29
+ ./scripts/cmake-build/cross_build_android_arm_inference.sh
30
+ # or cross build for android aarch64(with V8.2+fp16):
31
+ ./scripts/cmake-build/cross_build_android_arm_inference.sh -f
32
+
33
+ # after building MegEngine, you need to export `MGE_INSTALL_PATH`
34
+ # host without cuda:
35
+ export MGE_INSTALL_PATH=${megengine_root}/build_dir/host/MGE_WITH_CUDA_OFF/MGE_INFERENCE_ONLY_ON/Release/install
36
+ # or host with cuda:
37
+ export MGE_INSTALL_PATH=${megengine_root}/build_dir/host/MGE_WITH_CUDA_ON/MGE_INFERENCE_ONLY_ON/Release/install
38
+ # or cross build for android aarch64:
39
+ export MGE_INSTALL_PATH=${megengine_root}/build_dir/android/arm64-v8a/Release/install
40
+ ```
41
+ * you can refer to the [build tutorial of MegEngine](https://github.com/MegEngine/MegEngine/blob/master/scripts/cmake-build/BUILD_README.md) to build for other platforms, e.g. Windows/macOS, etc.
42
+
43
+ ### Step3: build OpenCV
44
+
45
+ ```shell
46
+ git clone https://github.com/opencv/opencv.git
47
+
48
+ git checkout 3.4.15  # tested with 3.4.15; other versions may need some build modifications
49
+ ```
50
+
51
+ - patch diff for android:
52
+
53
+ ```diff
+ diff --git a/CMakeLists.txt b/CMakeLists.txt
+ index f6a2da5310..10354312c9 100644
+ --- a/CMakeLists.txt
+ +++ b/CMakeLists.txt
+ @@ -643,7 +643,7 @@ if(UNIX)
+    if(NOT APPLE)
+      CHECK_INCLUDE_FILE(pthread.h HAVE_PTHREAD)
+      if(ANDROID)
+ -      set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} dl m log)
+ +      set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} dl m log z)
+      elseif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD|NetBSD|DragonFly|OpenBSD|Haiku")
+        set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} m pthread)
+      elseif(EMSCRIPTEN)
+ ```
71
+
72
+ - build for host
73
+
74
+ ```shell
75
+ cd root_dir_of_opencv
76
+ mkdir -p build/install
77
+ cd build
78
+ cmake -DBUILD_JAVA=OFF -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX=$PWD/install ..
79
+ make install -j32
80
+ ```
81
+
82
+ * build for android-aarch64
83
+
84
+ ```shell
85
+ cd root_dir_of_opencv
86
+ mkdir -p build_android/install
87
+ cd build_android
88
+
89
+ cmake -DCMAKE_TOOLCHAIN_FILE="$NDK_ROOT/build/cmake/android.toolchain.cmake" -DANDROID_NDK="$NDK_ROOT" -DANDROID_ABI=arm64-v8a -DANDROID_NATIVE_API_LEVEL=21 -DBUILD_JAVA=OFF -DBUILD_ANDROID_PROJECTS=OFF -DBUILD_ANDROID_EXAMPLES=OFF -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX=$PWD/install ..
90
+
91
+ make install -j32
92
+ ```
93
+
94
+ * after building OpenCV, you need to export `OPENCV_INSTALL_INCLUDE_PATH` and `OPENCV_INSTALL_LIB_PATH`
95
+
96
+ ```shell
97
+ # host build:
98
+ export OPENCV_INSTALL_INCLUDE_PATH=${path of opencv}/build/install/include
99
+ export OPENCV_INSTALL_LIB_PATH=${path of opencv}/build/install/lib
100
+ # or cross build for android aarch64:
101
+ export OPENCV_INSTALL_INCLUDE_PATH=${path of opencv}/build_android/install/sdk/native/jni/include
102
+ export OPENCV_INSTALL_LIB_PATH=${path of opencv}/build_android/install/sdk/native/libs/arm64-v8a
103
+ ```
104
+
105
+ ### Step4: build test demo
106
+
107
+ ```shell
108
+ # run build.sh:
109
+
110
+ # if host:
111
+ export CXX=g++
112
+ ./build.sh
113
+ # or cross android aarch64
114
+ export CXX=aarch64-linux-android21-clang++
115
+ ./build.sh
116
+ ```
117
+
118
+ ### Step5: run demo
119
+
120
+ > **Note**: there are two ways to get the `yolox_s.mge` model file:
121
+ >
122
+ > * refer to the Python demo's `dump.py` script.
123
+ > * For users with code before 0.1.0 version, wget yolox-s weights [here](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_s.mge).
124
+ > * For users with code after 0.1.0 version, use [python code in megengine](../python) to generate mge file.
125
+
126
+ ```shell
127
+ # if host:
128
+ LD_LIBRARY_PATH=$MGE_INSTALL_PATH/lib/:$OPENCV_INSTALL_LIB_PATH ./yolox yolox_s.mge ../../../assets/dog.jpg cuda/cpu/multithread <warmup_count> <thread_number> <use_fast_run> <use_weight_preprocess> <run_with_fp16>
129
+
130
+ # or cross android
131
+ adb push/scp $MGE_INSTALL_PATH/lib/libmegengine.so android_phone
132
+ adb push/scp $OPENCV_INSTALL_LIB_PATH/*.so android_phone
133
+ adb push/scp ./yolox yolox_s.mge android_phone
134
+ adb push/scp ../../../assets/dog.jpg android_phone
135
+
136
+ # login in android_phone by adb or ssh
137
+ # then run:
138
+ LD_LIBRARY_PATH=. ./yolox yolox_s.mge dog.jpg cpu/multithread <warmup_count> <thread_number> <use_fast_run> <use_weight_preprocess> <run_with_fp16>
139
+
140
+ # * <warmup_count> means warmup count, valid number >=0
141
+ # * <thread_number> means thread number, valid number >=1, only take effect `multithread` device
142
+ # * <use_fast_run> if >=1 , will use fastrun to choose best algo
143
+ # * <use_weight_preprocess> if >=1, will handle weight preprocess before exe
144
+ # * <run_with_fp16> if >=1, will run with fp16 mode
145
+ ```
146
+
147
+ ## Benchmark
148
+
149
+ * model info: yolox-s @ input(1,3,640,640)
150
+
151
+ * test devices
152
+
153
+ ```
154
+ * x86_64 -- Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
155
+ * aarch64 -- Xiaomi Mi 9 phone
156
+ * cuda -- 1080TI @ cuda-10.1-cudnn-v7.6.3-TensorRT-6.0.1.5.sh @ Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
157
+ ```
158
+
159
+ | megengine @ tag1.4(fastrun + weight\_preprocess)/sec | 1 thread |
160
+ | ---------------------------------------------------- | -------- |
161
+ | x86\_64 | 0.516245 |
162
+ | aarch64(fp32+chw44) | 0.587857 |
163
+
164
+ | CUDA @ 1080TI/sec | 1 batch | 2 batch | 4 batch | 8 batch | 16 batch | 32 batch | 64 batch |
165
+ | ------------------- | ---------- | --------- | --------- | --------- | --------- | -------- | -------- |
166
+ | megengine(fp32+chw) | 0.00813703 | 0.0132893 | 0.0236633 | 0.0444699 | 0.0864917 | 0.16895 | 0.334248 |
167
+
168
+ ## Acknowledgement
169
+
170
+ * [MegEngine](https://github.com/MegEngine/MegEngine)
171
+ * [OpenCV](https://github.com/opencv/opencv)
172
+ * [NDK](https://developer.android.com/ndk)
173
+ * [CMAKE](https://cmake.org/)
multimodal/YOLOX/demo/MegEngine/cpp/build.sh ADDED
@@ -0,0 +1,61 @@
1
+ #!/usr/bin/env bash
2
+ set -e
3
+
4
+ if [ -z $CXX ];then
5
+ echo "please export you c++ toolchain to CXX"
6
+ echo "for example:"
7
+ echo "build for host: export CXX=g++"
8
+ echo "cross build for aarch64-android(always locate in NDK): export CXX=aarch64-linux-android21-clang++"
9
+ echo "cross build for aarch64-linux: export CXX=aarch64-linux-gnu-g++"
10
+ exit -1
11
+ fi
12
+
13
+ if [ -z $MGE_INSTALL_PATH ];then
14
+ echo "please refsi ./README.md to init MGE_INSTALL_PATH env"
15
+ exit -1
16
+ fi
17
+
18
+ if [ -z $OPENCV_INSTALL_INCLUDE_PATH ];then
19
+ echo "please refs ./README.md to init OPENCV_INSTALL_INCLUDE_PATH env"
20
+ exit -1
21
+ fi
22
+
23
+ if [ -z $OPENCV_INSTALL_LIB_PATH ];then
24
+ echo "please refs ./README.md to init OPENCV_INSTALL_LIB_PATH env"
25
+ exit -1
26
+ fi
27
+
28
+ INCLUDE_FLAG="-I$MGE_INSTALL_PATH/include -I$OPENCV_INSTALL_INCLUDE_PATH"
29
+ LINK_FLAG="-L$MGE_INSTALL_PATH/lib/ -lmegengine -L$OPENCV_INSTALL_LIB_PATH -lopencv_core -lopencv_highgui -lopencv_imgproc -lopencv_imgcodecs"
30
+ BUILD_FLAG="-static-libstdc++ -O3 -pie -fPIE -g"
31
+
32
+ if [[ $CXX =~ "android" ]]; then
33
+ LINK_FLAG="${LINK_FLAG} -llog -lz"
34
+ fi
35
+
36
+ echo "CXX: $CXX"
37
+ echo "MGE_INSTALL_PATH: $MGE_INSTALL_PATH"
38
+ echo "INCLUDE_FLAG: $INCLUDE_FLAG"
39
+ echo "LINK_FLAG: $LINK_FLAG"
40
+ echo "BUILD_FLAG: $BUILD_FLAG"
41
+
42
+ echo "[" > compile_commands.json
43
+ echo "{" >> compile_commands.json
44
+ echo "\"directory\": \"$PWD\"," >> compile_commands.json
45
+ echo "\"command\": \"$CXX yolox.cpp -o yolox ${INCLUDE_FLAG} ${LINK_FLAG}\"," >> compile_commands.json
46
+ echo "\"file\": \"$PWD/yolox.cpp\"," >> compile_commands.json
47
+ echo "}," >> compile_commands.json
48
+ echo "]" >> compile_commands.json
49
+ $CXX yolox.cpp -o yolox ${INCLUDE_FLAG} ${LINK_FLAG} ${BUILD_FLAG}
50
+
51
+ echo "build success, output file: yolox"
52
+ if [[ $CXX =~ "android" ]]; then
53
+ echo "try command to run:"
54
+ echo "adb push/scp $MGE_INSTALL_PATH/lib/libmegengine.so android_phone"
55
+ echo "adb push/scp $OPENCV_INSTALL_LIB_PATH/*.so android_phone"
56
+ echo "adb push/scp ./yolox yolox_s.mge android_phone"
57
+ echo "adb push/scp ../../../assets/dog.jpg android_phone"
58
+ echo "adb/ssh to android_phone, then run: LD_LIBRARY_PATH=. ./yolox yolox_s.mge dog.jpg cpu/multithread <warmup_count> <thread_number> <use_fast_run> <use_weight_preprocess>"
59
+ else
60
+ echo "try command to run: LD_LIBRARY_PATH=$MGE_INSTALL_PATH/lib/:$OPENCV_INSTALL_LIB_PATH ./yolox yolox_s.mge ../../../assets/dog.jpg cuda/cpu/multithread <warmup_count> <thread_number> <use_fast_run> <use_weight_preprocess>"
61
+ fi
multimodal/YOLOX/demo/MegEngine/cpp/yolox.cpp ADDED
@@ -0,0 +1,470 @@
1
+ // Copyright (C) 2018-2021 Intel Corporation
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "megbrain/gopt/inference.h"
5
+ #include "megbrain/opr/search_policy/algo_chooser_helper.h"
6
+ #include "megbrain/serialization/serializer.h"
7
+ #include <chrono>
+ #include <iostream>
8
+ #include <iterator>
9
+ #include <memory>
10
+ #include <opencv2/opencv.hpp>
11
+ #include <stdlib.h>
12
+ #include <string>
13
+ #include <vector>
14
+
15
+ /**
16
+ * @brief Define names based depends on Unicode path support
17
+ */
18
+ #define NMS_THRESH 0.45
19
+ #define BBOX_CONF_THRESH 0.25
20
+
21
+ constexpr int INPUT_W = 640;
22
+ constexpr int INPUT_H = 640;
23
+
24
+ using namespace mgb;
25
+
26
+ cv::Mat static_resize(cv::Mat &img) {
27
+ float r = std::min(INPUT_W / (img.cols * 1.0), INPUT_H / (img.rows * 1.0));
28
+ int unpad_w = r * img.cols;
29
+ int unpad_h = r * img.rows;
30
+ cv::Mat re(unpad_h, unpad_w, CV_8UC3);
31
+ cv::resize(img, re, re.size());
32
+ cv::Mat out(INPUT_W, INPUT_H, CV_8UC3, cv::Scalar(114, 114, 114));
33
+ re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)));
34
+ return out;
35
+ }
36
+
37
+ void blobFromImage(cv::Mat &img, float *blob_data) {
38
+ int channels = 3;
39
+ int img_h = img.rows;
40
+ int img_w = img.cols;
41
+ for (size_t c = 0; c < channels; c++) {
42
+ for (size_t h = 0; h < img_h; h++) {
43
+ for (size_t w = 0; w < img_w; w++) {
44
+ blob_data[c * img_w * img_h + h * img_w + w] =
45
+ (float)img.at<cv::Vec3b>(h, w)[c];
46
+ }
47
+ }
48
+ }
49
+ }
50
+
51
+ struct Object {
52
+ cv::Rect_<float> rect;
53
+ int label;
54
+ float prob;
55
+ };
56
+
57
+ struct GridAndStride {
58
+ int grid0;
59
+ int grid1;
60
+ int stride;
61
+ };
62
+
63
+ static void
64
+ generate_grids_and_stride(const int target_size, std::vector<int> &strides,
65
+ std::vector<GridAndStride> &grid_strides) {
66
+ for (auto stride : strides) {
67
+ int num_grid = target_size / stride;
68
+ for (int g1 = 0; g1 < num_grid; g1++) {
69
+ for (int g0 = 0; g0 < num_grid; g0++) {
70
+ grid_strides.push_back((GridAndStride){g0, g1, stride});
71
+ }
72
+ }
73
+ }
74
+ }
75
+
76
+ static void generate_yolox_proposals(std::vector<GridAndStride> grid_strides,
77
+ const float *feat_ptr,
78
+ float prob_threshold,
79
+ std::vector<Object> &objects) {
80
+ const int num_class = 80;
81
+ const int num_anchors = grid_strides.size();
82
+
83
+ for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++) {
84
+ const int grid0 = grid_strides[anchor_idx].grid0;
85
+ const int grid1 = grid_strides[anchor_idx].grid1;
86
+ const int stride = grid_strides[anchor_idx].stride;
87
+
88
+ const int basic_pos = anchor_idx * 85;
89
+
90
+ float x_center = (feat_ptr[basic_pos + 0] + grid0) * stride;
91
+ float y_center = (feat_ptr[basic_pos + 1] + grid1) * stride;
92
+ float w = exp(feat_ptr[basic_pos + 2]) * stride;
93
+ float h = exp(feat_ptr[basic_pos + 3]) * stride;
94
+ float x0 = x_center - w * 0.5f;
95
+ float y0 = y_center - h * 0.5f;
96
+
97
+ float box_objectness = feat_ptr[basic_pos + 4];
98
+ for (int class_idx = 0; class_idx < num_class; class_idx++) {
99
+ float box_cls_score = feat_ptr[basic_pos + 5 + class_idx];
100
+ float box_prob = box_objectness * box_cls_score;
101
+ if (box_prob > prob_threshold) {
102
+ Object obj;
103
+ obj.rect.x = x0;
104
+ obj.rect.y = y0;
105
+ obj.rect.width = w;
106
+ obj.rect.height = h;
107
+ obj.label = class_idx;
108
+ obj.prob = box_prob;
109
+
110
+ objects.push_back(obj);
111
+ }
112
+
113
+ } // class loop
114
+
115
+ } // point anchor loop
116
+ }
117
+
118
+ static inline float intersection_area(const Object &a, const Object &b) {
119
+ cv::Rect_<float> inter = a.rect & b.rect;
120
+ return inter.area();
121
+ }
122
+
123
+ static void qsort_descent_inplace(std::vector<Object> &faceobjects, int left,
124
+ int right) {
125
+ int i = left;
126
+ int j = right;
127
+ float p = faceobjects[(left + right) / 2].prob;
128
+
129
+ while (i <= j) {
130
+ while (faceobjects[i].prob > p)
131
+ i++;
132
+
133
+ while (faceobjects[j].prob < p)
134
+ j--;
135
+
136
+ if (i <= j) {
137
+ // swap
138
+ std::swap(faceobjects[i], faceobjects[j]);
139
+
140
+ i++;
141
+ j--;
142
+ }
143
+ }
144
+
145
+ #pragma omp parallel sections
146
+ {
147
+ #pragma omp section
148
+ {
149
+ if (left < j)
150
+ qsort_descent_inplace(faceobjects, left, j);
151
+ }
152
+ #pragma omp section
153
+ {
154
+ if (i < right)
155
+ qsort_descent_inplace(faceobjects, i, right);
156
+ }
157
+ }
158
+ }
159
+
160
+ static void qsort_descent_inplace(std::vector<Object> &objects) {
161
+ if (objects.empty())
162
+ return;
163
+
164
+ qsort_descent_inplace(objects, 0, objects.size() - 1);
165
+ }
166
+
167
+ static void nms_sorted_bboxes(const std::vector<Object> &faceobjects,
168
+ std::vector<int> &picked, float nms_threshold) {
169
+ picked.clear();
170
+
171
+ const int n = faceobjects.size();
172
+
173
+ std::vector<float> areas(n);
174
+ for (int i = 0; i < n; i++) {
175
+ areas[i] = faceobjects[i].rect.area();
176
+ }
177
+
178
+ for (int i = 0; i < n; i++) {
179
+ const Object &a = faceobjects[i];
180
+
181
+ int keep = 1;
182
+ for (int j = 0; j < (int)picked.size(); j++) {
183
+ const Object &b = faceobjects[picked[j]];
184
+
185
+ // intersection over union
186
+ float inter_area = intersection_area(a, b);
187
+ float union_area = areas[i] + areas[picked[j]] - inter_area;
188
+ // float IoU = inter_area / union_area
189
+ if (inter_area / union_area > nms_threshold)
190
+ keep = 0;
191
+ }
192
+
193
+ if (keep)
194
+ picked.push_back(i);
195
+ }
196
+ }
197
+
198
+ static void decode_outputs(const float *prob, std::vector<Object> &objects,
199
+ float scale, const int img_w, const int img_h) {
200
+ std::vector<Object> proposals;
201
+ std::vector<int> strides = {8, 16, 32};
202
+ std::vector<GridAndStride> grid_strides;
203
+
204
+ generate_grids_and_stride(INPUT_W, strides, grid_strides);
205
+ generate_yolox_proposals(grid_strides, prob, BBOX_CONF_THRESH, proposals);
206
+ qsort_descent_inplace(proposals);
207
+
208
+ std::vector<int> picked;
209
+ nms_sorted_bboxes(proposals, picked, NMS_THRESH);
210
+ int count = picked.size();
211
+ objects.resize(count);
212
+
213
+ for (int i = 0; i < count; i++) {
214
+ objects[i] = proposals[picked[i]];
215
+
216
+ // adjust offset to original unpadded
217
+ float x0 = (objects[i].rect.x) / scale;
218
+ float y0 = (objects[i].rect.y) / scale;
219
+ float x1 = (objects[i].rect.x + objects[i].rect.width) / scale;
220
+ float y1 = (objects[i].rect.y + objects[i].rect.height) / scale;
221
+
222
+ // clip
223
+ x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
224
+ y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
225
+ x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
226
+ y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
227
+
228
+ objects[i].rect.x = x0;
229
+ objects[i].rect.y = y0;
230
+ objects[i].rect.width = x1 - x0;
231
+ objects[i].rect.height = y1 - y0;
232
+ }
233
+ }
234
+
235
+ const float color_list[80][3] = {
236
+ {0.000, 0.447, 0.741}, {0.850, 0.325, 0.098}, {0.929, 0.694, 0.125},
237
+ {0.494, 0.184, 0.556}, {0.466, 0.674, 0.188}, {0.301, 0.745, 0.933},
238
+ {0.635, 0.078, 0.184}, {0.300, 0.300, 0.300}, {0.600, 0.600, 0.600},
239
+ {1.000, 0.000, 0.000}, {1.000, 0.500, 0.000}, {0.749, 0.749, 0.000},
240
+ {0.000, 1.000, 0.000}, {0.000, 0.000, 1.000}, {0.667, 0.000, 1.000},
241
+ {0.333, 0.333, 0.000}, {0.333, 0.667, 0.000}, {0.333, 1.000, 0.000},
242
+ {0.667, 0.333, 0.000}, {0.667, 0.667, 0.000}, {0.667, 1.000, 0.000},
243
+ {1.000, 0.333, 0.000}, {1.000, 0.667, 0.000}, {1.000, 1.000, 0.000},
244
+ {0.000, 0.333, 0.500}, {0.000, 0.667, 0.500}, {0.000, 1.000, 0.500},
245
+ {0.333, 0.000, 0.500}, {0.333, 0.333, 0.500}, {0.333, 0.667, 0.500},
246
+ {0.333, 1.000, 0.500}, {0.667, 0.000, 0.500}, {0.667, 0.333, 0.500},
247
+ {0.667, 0.667, 0.500}, {0.667, 1.000, 0.500}, {1.000, 0.000, 0.500},
248
+ {1.000, 0.333, 0.500}, {1.000, 0.667, 0.500}, {1.000, 1.000, 0.500},
249
+ {0.000, 0.333, 1.000}, {0.000, 0.667, 1.000}, {0.000, 1.000, 1.000},
250
+ {0.333, 0.000, 1.000}, {0.333, 0.333, 1.000}, {0.333, 0.667, 1.000},
251
+ {0.333, 1.000, 1.000}, {0.667, 0.000, 1.000}, {0.667, 0.333, 1.000},
252
+ {0.667, 0.667, 1.000}, {0.667, 1.000, 1.000}, {1.000, 0.000, 1.000},
253
+ {1.000, 0.333, 1.000}, {1.000, 0.667, 1.000}, {0.333, 0.000, 0.000},
254
+ {0.500, 0.000, 0.000}, {0.667, 0.000, 0.000}, {0.833, 0.000, 0.000},
255
+ {1.000, 0.000, 0.000}, {0.000, 0.167, 0.000}, {0.000, 0.333, 0.000},
256
+ {0.000, 0.500, 0.000}, {0.000, 0.667, 0.000}, {0.000, 0.833, 0.000},
257
+ {0.000, 1.000, 0.000}, {0.000, 0.000, 0.167}, {0.000, 0.000, 0.333},
258
+ {0.000, 0.000, 0.500}, {0.000, 0.000, 0.667}, {0.000, 0.000, 0.833},
259
+ {0.000, 0.000, 1.000}, {0.000, 0.000, 0.000}, {0.143, 0.143, 0.143},
260
+ {0.286, 0.286, 0.286}, {0.429, 0.429, 0.429}, {0.571, 0.571, 0.571},
261
+ {0.714, 0.714, 0.714}, {0.857, 0.857, 0.857}, {0.000, 0.447, 0.741},
262
+ {0.314, 0.717, 0.741}, {0.50, 0.5, 0}};
263
+
264
+ static void draw_objects(const cv::Mat &bgr,
265
+ const std::vector<Object> &objects) {
266
+ static const char *class_names[] = {
267
+ "person", "bicycle", "car",
268
+ "motorcycle", "airplane", "bus",
269
+ "train", "truck", "boat",
270
+ "traffic light", "fire hydrant", "stop sign",
271
+ "parking meter", "bench", "bird",
272
+ "cat", "dog", "horse",
273
+ "sheep", "cow", "elephant",
274
+ "bear", "zebra", "giraffe",
275
+ "backpack", "umbrella", "handbag",
276
+ "tie", "suitcase", "frisbee",
277
+ "skis", "snowboard", "sports ball",
278
+ "kite", "baseball bat", "baseball glove",
279
+ "skateboard", "surfboard", "tennis racket",
280
+ "bottle", "wine glass", "cup",
281
+ "fork", "knife", "spoon",
282
+ "bowl", "banana", "apple",
283
+ "sandwich", "orange", "broccoli",
284
+ "carrot", "hot dog", "pizza",
285
+ "donut", "cake", "chair",
286
+ "couch", "potted plant", "bed",
287
+ "dining table", "toilet", "tv",
288
+ "laptop", "mouse", "remote",
289
+ "keyboard", "cell phone", "microwave",
290
+ "oven", "toaster", "sink",
291
+ "refrigerator", "book", "clock",
292
+ "vase", "scissors", "teddy bear",
293
+ "hair drier", "toothbrush"};
294
+
295
+ cv::Mat image = bgr.clone();
296
+
297
+ for (size_t i = 0; i < objects.size(); i++) {
298
+ const Object &obj = objects[i];
299
+
300
+ fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
301
+ obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
302
+
303
+ cv::Scalar color =
304
+ cv::Scalar(color_list[obj.label][0], color_list[obj.label][1],
305
+ color_list[obj.label][2]);
306
+ float c_mean = cv::mean(color)[0];
307
+ cv::Scalar txt_color;
308
+ if (c_mean > 0.5) {
309
+ txt_color = cv::Scalar(0, 0, 0);
310
+ } else {
311
+ txt_color = cv::Scalar(255, 255, 255);
312
+ }
313
+
314
+ cv::rectangle(image, obj.rect, color * 255, 2);
315
+
316
+ char text[256];
317
+ sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
318
+
319
+ int baseLine = 0;
320
+ cv::Size label_size =
321
+ cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine);
322
+
323
+ cv::Scalar txt_bk_color = color * 0.7 * 255;
324
+
325
+ int x = obj.rect.x;
326
+ int y = obj.rect.y + 1;
327
+ // int y = obj.rect.y - label_size.height - baseLine;
328
+ if (y > image.rows)
329
+ y = image.rows;
330
+ // if (x + label_size.width > image.cols)
331
+ // x = image.cols - label_size.width;
332
+
333
+ cv::rectangle(
334
+ image,
335
+ cv::Rect(cv::Point(x, y),
336
+ cv::Size(label_size.width, label_size.height + baseLine)),
337
+ txt_bk_color, -1);
338
+
339
+ cv::putText(image, text, cv::Point(x, y + label_size.height),
340
+ cv::FONT_HERSHEY_SIMPLEX, 0.4, txt_color, 1);
341
+ }
342
+
343
+ cv::imwrite("out.jpg", image);
344
+ std::cout << "save output to out.jpg" << std::endl;
345
+ }
346
+
347
+ cg::ComputingGraph::OutputSpecItem make_callback_copy(SymbolVar dev,
348
+ HostTensorND &host) {
349
+ auto cb = [&host](DeviceTensorND &d) { host.copy_from(d); };
350
+ return {dev, cb};
351
+ }
352
+
353
+ int main(int argc, char *argv[]) {
354
+ serialization::GraphLoader::LoadConfig load_config;
355
+ load_config.comp_graph = ComputingGraph::make();
356
+ auto &&graph_opt = load_config.comp_graph->options();
357
+ graph_opt.graph_opt_level = 0;
358
+
359
+ if (argc != 9) {
360
+ std::cout << "Usage : " << argv[0]
361
+ << " <path_to_model> <path_to_image> <device> <warmup_count> "
362
+ "<thread_number> <use_fast_run> <use_weight_preprocess> "
363
+ "<run_with_fp16>"
364
+ << std::endl;
365
+ return EXIT_FAILURE;
366
+ }
367
+
368
+ const std::string input_model{argv[1]};
369
+ const std::string input_image_path{argv[2]};
370
+ const std::string device{argv[3]};
371
+ const size_t warmup_count = atoi(argv[4]);
372
+ const size_t thread_number = atoi(argv[5]);
373
+ const size_t use_fast_run = atoi(argv[6]);
374
+ const size_t use_weight_preprocess = atoi(argv[7]);
375
+ const size_t run_with_fp16 = atoi(argv[8]);
376
+
377
+ if (device == "cuda") {
378
+ load_config.comp_node_mapper = [](CompNode::Locator &loc) {
379
+ loc.type = CompNode::DeviceType::CUDA;
380
+ };
381
+ } else if (device == "cpu") {
382
+ load_config.comp_node_mapper = [](CompNode::Locator &loc) {
383
+ loc.type = CompNode::DeviceType::CPU;
384
+ };
385
+ } else if (device == "multithread") {
386
+ load_config.comp_node_mapper = [thread_number](CompNode::Locator &loc) {
387
+ loc.type = CompNode::DeviceType::MULTITHREAD;
388
+ loc.device = 0;
389
+ loc.stream = thread_number;
390
+ };
391
+ std::cout << "use " << thread_number << " thread" << std::endl;
392
+ } else {
393
+ std::cout << "device only support cuda or cpu or multithread" << std::endl;
394
+ return EXIT_FAILURE;
395
+ }
396
+
397
+ if (use_weight_preprocess) {
398
+ std::cout << "use weight preprocess" << std::endl;
399
+ graph_opt.graph_opt.enable_weight_preprocess();
400
+ }
401
+ if (run_with_fp16) {
402
+ std::cout << "run with fp16" << std::endl;
403
+ graph_opt.graph_opt.enable_f16_io_comp();
404
+ }
405
+
406
+ if (device == "cuda") {
407
+ std::cout << "choose format for cuda" << std::endl;
408
+ } else {
409
+ std::cout << "choose format for non-cuda" << std::endl;
410
+ #if defined(__arm__) || defined(__aarch64__)
411
+ if (run_with_fp16) {
412
+ std::cout << "use chw format when enable fp16" << std::endl;
413
+ } else {
414
+ std::cout << "choose format for nchw44 for aarch64" << std::endl;
415
+ graph_opt.graph_opt.enable_nchw44();
416
+ }
417
+ #endif
418
+ #if defined(__x86_64__) || defined(__amd64__) || defined(__i386__)
419
+ // graph_opt.graph_opt.enable_nchw88();
420
+ #endif
421
+ }
422
+
423
+ std::unique_ptr<serialization::InputFile> inp_file =
424
+ serialization::InputFile::make_fs(input_model.c_str());
425
+ auto loader = serialization::GraphLoader::make(std::move(inp_file));
426
+ serialization::GraphLoader::LoadResult network =
427
+ loader->load(load_config, false);
428
+
429
+ if (use_fast_run) {
430
+ std::cout << "use fastrun" << std::endl;
431
+ using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
432
+ S strategy = static_cast<S>(0);
433
+ strategy = S::PROFILE | S::OPTIMIZED | strategy;
434
+ mgb::gopt::modify_opr_algo_strategy_inplace(network.output_var_list,
435
+ strategy);
436
+ }
437
+
438
+ auto data = network.tensor_map["data"];
439
+ cv::Mat image = cv::imread(input_image_path);
440
+ cv::Mat pr_img = static_resize(image);
441
+ float *data_ptr = data->resize({1, 3, 640, 640}).ptr<float>();
442
+ blobFromImage(pr_img, data_ptr);
443
+ HostTensorND predict;
444
+ std::unique_ptr<cg::AsyncExecutable> func = network.graph->compile(
445
+ {make_callback_copy(network.output_var_map.begin()->second, predict)});
446
+
447
+ for (auto i = 0; i < warmup_count; i++) {
448
+ std::cout << "warmup: " << i << std::endl;
449
+ func->execute();
450
+ func->wait();
451
+ }
452
+ auto start = std::chrono::system_clock::now();
453
+ func->execute();
454
+ func->wait();
455
+ auto end = std::chrono::system_clock::now();
456
+ std::chrono::duration<double> exec_seconds = end - start;
457
+ std::cout << "elapsed time: " << exec_seconds.count() << "s" << std::endl;
458
+
459
+ float *predict_ptr = predict.ptr<float>();
460
+ int img_w = image.cols;
461
+ int img_h = image.rows;
462
+ float scale =
463
+ std::min(INPUT_W / (image.cols * 1.0), INPUT_H / (image.rows * 1.0));
464
+ std::vector<Object> objects;
465
+
466
+ decode_outputs(predict_ptr, objects, scale, img_w, img_h);
467
+ draw_objects(image, objects);
468
+
469
+ return EXIT_SUCCESS;
470
+ }
multimodal/YOLOX/demo/MegEngine/python/README.md ADDED
@@ -0,0 +1,33 @@
1
+ # YOLOX-Python-MegEngine
2
+
3
+ Python version of YOLOX object detection based on [MegEngine](https://github.com/MegEngine/MegEngine).
4
+
5
+ ## Tutorial
6
+
7
+ ### Step1: install requirements
8
+
9
+ ```
10
+ python3 -m pip install megengine -f https://megengine.org.cn/whl/mge.html
11
+ ```
12
+
13
+ ### Step2: convert checkpoint weights from a torch .pth file
14
+
15
+ ```
16
+ python3 convert_weights.py -w yolox_s.pth -o yolox_s_mge.pkl
17
+ ```
18
+
19
+ ### Step3: run demo
20
+
21
+ This part is the same as the PyTorch Python demo, but there is no need to specify a device.
22
+
23
+ ```
24
+ python3 demo.py image -n yolox-s -c yolox_s_mge.pkl --path ../../../assets/dog.jpg --conf 0.25 --nms 0.45 --tsize 640 --save_result
25
+ ```
26
+
27
+ ### [Optional] Step4: dump model for C++ inference
28
+
29
+ > **Note**: the resulting model is dumped with `optimize_for_inference` and `enable_fuse_conv_bias_nonlinearity`.
30
+
31
+ ```
32
+ python3 dump.py -n yolox-s -c yolox_s_mge.pkl --dump_path yolox_s.mge
33
+ ```
multimodal/YOLOX/demo/MegEngine/python/build.py ADDED
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding:utf-8 -*-
3
+
4
+ import megengine as mge
5
+ import megengine.module as M
6
+
7
+ from models.yolo_fpn import YOLOFPN
8
+ from models.yolo_head import YOLOXHead
9
+ from models.yolo_pafpn import YOLOPAFPN
10
+ from models.yolox import YOLOX
11
+
12
+
13
+ def build_yolox(name="yolox-s"):
14
+ num_classes = 80
15
+
16
+ # value meaning: depth, width
17
+ param_dict = {
18
+ "yolox-nano": (0.33, 0.25),
19
+ "yolox-tiny": (0.33, 0.375),
20
+ "yolox-s": (0.33, 0.50),
21
+ "yolox-m": (0.67, 0.75),
22
+ "yolox-l": (1.0, 1.0),
23
+ "yolox-x": (1.33, 1.25),
24
+ }
25
+ if name == "yolov3":
26
+ depth = 1.0
27
+ width = 1.0
28
+ backbone = YOLOFPN()
29
+ head = YOLOXHead(num_classes, width, in_channels=[128, 256, 512], act="lrelu")
30
+ model = YOLOX(backbone, head)
31
+ else:
32
+ assert name in param_dict
33
+ kwargs = {}
34
+ depth, width = param_dict[name]
35
+ if name == "yolox-nano":
36
+ kwargs["depthwise"] = True
37
+ in_channels = [256, 512, 1024]
38
+ backbone = YOLOPAFPN(depth, width, in_channels=in_channels, **kwargs)
39
+ head = YOLOXHead(num_classes, width, in_channels=in_channels, **kwargs)
40
+ model = YOLOX(backbone, head)
41
+
42
+ for m in model.modules():
43
+ if isinstance(m, M.BatchNorm2d):
44
+ m.eps = 1e-3
45
+
46
+ return model
47
+
48
+
49
+ def build_and_load(weight_file, name="yolox-s"):
50
+ model = build_yolox(name)
51
+ model_weights = mge.load(weight_file)
52
+ model.load_state_dict(model_weights, strict=False)
53
+ return model
multimodal/YOLOX/demo/MegEngine/python/convert_weights.py ADDED
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding:utf-8 -*-
3
+ import argparse
4
+ from collections import OrderedDict
5
+
6
+ import megengine as mge
7
+ import torch
8
+
9
+
10
+ def make_parser():
11
+ parser = argparse.ArgumentParser()
12
+ parser.add_argument("-w", "--weights", type=str, help="path of weight file")
13
+ parser.add_argument(
14
+ "-o",
15
+ "--output",
16
+ default="weight_mge.pkl",
17
+ type=str,
18
+ help="path of weight file",
19
+ )
20
+ return parser
21
+
22
+
23
+ def numpy_weights(weight_file):
24
+ torch_weights = torch.load(weight_file, map_location="cpu")
25
+ if "model" in torch_weights:
26
+ torch_weights = torch_weights["model"]
27
+ new_dict = OrderedDict()
28
+ for k, v in torch_weights.items():
29
+ new_dict[k] = v.cpu().numpy()
30
+ return new_dict
31
+
32
+
33
+ def map_weights(weight_file, output_file):
34
+ torch_weights = numpy_weights(weight_file)
35
+
36
+ new_dict = OrderedDict()
37
+ for k, v in torch_weights.items():
38
+ if "num_batches_tracked" in k:
39
+ print("drop: {}".format(k))
40
+ continue
41
+ if k.endswith("bias"):
42
+ print("bias key: {}".format(k))
43
+ v = v.reshape(1, -1, 1, 1)
44
+ new_dict[k] = v
45
+ elif "dconv" in k and "conv.weight" in k:
46
+ print("depthwise conv key: {}".format(k))
47
+ cout, cin, k1, k2 = v.shape
48
+ v = v.reshape(cout, 1, cin, k1, k2)
49
+ new_dict[k] = v
50
+ else:
51
+ new_dict[k] = v
52
+
53
+ mge.save(new_dict, output_file)
54
+ print("save weights to {}".format(output_file))
55
+
56
+
57
+ def main():
58
+ parser = make_parser()
59
+ args = parser.parse_args()
60
+ map_weights(args.weights, args.output)
61
+
62
+
63
+ if __name__ == "__main__":
64
+ main()
multimodal/YOLOX/demo/MegEngine/python/demo.py ADDED
@@ -0,0 +1,237 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding:utf-8 -*-
3
+ # Copyright (c) Megvii, Inc. and its affiliates.
4
+
5
+ import argparse
6
+ import os
7
+ import time
8
+
9
+ import cv2
10
+ import megengine as mge
11
+ import megengine.functional as F
12
+ from loguru import logger
13
+
14
+ from yolox.data.datasets import COCO_CLASSES
15
+ from yolox.utils import vis
16
+ from yolox.data.data_augment import preproc as preprocess
17
+
18
+ from build import build_and_load
19
+
20
+ IMAGE_EXT = [".jpg", ".jpeg", ".webp", ".bmp", ".png"]
21
+
22
+
23
+ def make_parser():
24
+ parser = argparse.ArgumentParser("YOLOX Demo!")
25
+ parser.add_argument(
26
+ "demo", default="image", help="demo type, eg. image, video and webcam"
27
+ )
28
+ parser.add_argument("-n", "--name", type=str, default="yolox-s", help="model name")
29
+ parser.add_argument("--path", default="./test.png", help="path to images or video")
30
+ parser.add_argument("--camid", type=int, default=0, help="webcam demo camera id")
31
+ parser.add_argument(
32
+ "--save_result",
33
+ action="store_true",
34
+ help="whether to save the inference result of image/video",
35
+ )
36
+
37
+ parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval")
38
+ parser.add_argument("--conf", default=None, type=float, help="test conf")
39
+ parser.add_argument("--nms", default=None, type=float, help="test nms threshold")
40
+ parser.add_argument("--tsize", default=None, type=int, help="test img size")
41
+ return parser
42
+
43
+
44
+ def get_image_list(path):
45
+ image_names = []
46
+ for maindir, subdir, file_name_list in os.walk(path):
47
+ for filename in file_name_list:
48
+ apath = os.path.join(maindir, filename)
49
+ ext = os.path.splitext(apath)[1]
50
+ if ext in IMAGE_EXT:
51
+ image_names.append(apath)
52
+ return image_names
53
+
54
+
55
+ def postprocess(prediction, num_classes, conf_thre=0.7, nms_thre=0.45):
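+ # convert (cx, cy, w, h) boxes to (x1, y1, x2, y2), keep predictions with obj_conf * cls_conf above the threshold, then apply NMS per image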
56
+ box_corner = F.zeros_like(prediction)
57
+ box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
58
+ box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
59
+ box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
60
+ box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
61
+ prediction[:, :, :4] = box_corner[:, :, :4]
62
+
63
+ output = [None for _ in range(len(prediction))]
64
+ for i, image_pred in enumerate(prediction):
65
+
66
+ # If none are remaining => process next image
67
+ if not image_pred.shape[0]:
68
+ continue
69
+ # Get score and class with highest confidence
70
+ class_conf = F.max(image_pred[:, 5: 5 + num_classes], 1, keepdims=True)
71
+ class_pred = F.argmax(image_pred[:, 5: 5 + num_classes], 1, keepdims=True)
72
+
73
+ class_conf_squeeze = F.squeeze(class_conf)
74
+ conf_mask = image_pred[:, 4] * class_conf_squeeze >= conf_thre
75
+ detections = F.concat((image_pred[:, :5], class_conf, class_pred), 1)
76
+ detections = detections[conf_mask]
77
+ if not detections.shape[0]:
78
+ continue
79
+
80
+ nms_out_index = F.vision.nms(
81
+ detections[:, :4], detections[:, 4] * detections[:, 5], nms_thre,
82
+ )
83
+ detections = detections[nms_out_index]
84
+ if output[i] is None:
85
+ output[i] = detections
86
+ else:
87
+ output[i] = F.concat((output[i], detections))
88
+
89
+ return output
90
+
91
+
92
+ class Predictor(object):
93
+ def __init__(
94
+ self,
95
+ model,
96
+ confthre=0.01,
97
+ nmsthre=0.65,
98
+ test_size=(640, 640),
99
+ cls_names=COCO_CLASSES,
100
+ trt_file=None,
101
+ decoder=None,
102
+ ):
103
+ self.model = model
104
+ self.cls_names = cls_names
105
+ self.decoder = decoder
106
+ self.num_classes = 80
107
+ self.confthre = confthre
108
+ self.nmsthre = nmsthre
109
+ self.test_size = test_size
110
+
111
+ def inference(self, img):
112
+ img_info = {"id": 0}
113
+ if isinstance(img, str):
114
+ img_info["file_name"] = os.path.basename(img)
115
+ img = cv2.imread(img)
116
+ if img is None:
117
+ raise ValueError("test image path is invalid!")
118
+ else:
119
+ img_info["file_name"] = None
120
+
121
+ height, width = img.shape[:2]
122
+ img_info["height"] = height
123
+ img_info["width"] = width
124
+ img_info["raw_img"] = img
125
+
126
+ img, ratio = preprocess(img, self.test_size)
127
+ img_info["ratio"] = ratio
128
+ img = F.expand_dims(mge.tensor(img), 0)
129
+
130
+ t0 = time.time()
131
+ outputs = self.model(img)
132
+ outputs = postprocess(outputs, self.num_classes, self.confthre, self.nmsthre)
133
+ logger.info("Infer time: {:.4f}s".format(time.time() - t0))
134
+ return outputs, img_info
135
+
136
+ def visual(self, output, img_info, cls_conf=0.35):
137
+ ratio = img_info["ratio"]
138
+ img = img_info["raw_img"]
139
+ if output is None:
140
+ return img
141
+ output = output.numpy()
142
+
143
+ # preprocessing: resize
144
+ bboxes = output[:, 0:4] / ratio
145
+
146
+ cls = output[:, 6]
147
+ scores = output[:, 4] * output[:, 5]
148
+
149
+ vis_res = vis(img, bboxes, scores, cls, cls_conf, self.cls_names)
150
+ return vis_res
151
+
152
+
153
+ def image_demo(predictor, vis_folder, path, current_time, save_result):
154
+ if os.path.isdir(path):
155
+ files = get_image_list(path)
156
+ else:
157
+ files = [path]
158
+ files.sort()
159
+ for image_name in files:
160
+ outputs, img_info = predictor.inference(image_name)
161
+ result_image = predictor.visual(outputs[0], img_info)
162
+ if save_result:
163
+ save_folder = os.path.join(
164
+ vis_folder, time.strftime("%Y_%m_%d_%H_%M_%S", current_time)
165
+ )
166
+ os.makedirs(save_folder, exist_ok=True)
167
+ save_file_name = os.path.join(save_folder, os.path.basename(image_name))
168
+ logger.info("Saving detection result in {}".format(save_file_name))
169
+ cv2.imwrite(save_file_name, result_image)
170
+ ch = cv2.waitKey(0)
171
+ if ch == 27 or ch == ord("q") or ch == ord("Q"):
172
+ break
173
+
174
+
175
+ def imageflow_demo(predictor, vis_folder, current_time, args):
176
+ cap = cv2.VideoCapture(args.path if args.demo == "video" else args.camid)
177
+ width = cap.get(cv2.CAP_PROP_FRAME_WIDTH) # float
178
+ height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT) # float
179
+ fps = cap.get(cv2.CAP_PROP_FPS)
180
+ save_folder = os.path.join(
181
+ vis_folder, time.strftime("%Y_%m_%d_%H_%M_%S", current_time)
182
+ )
183
+ os.makedirs(save_folder, exist_ok=True)
184
+ if args.demo == "video":
185
+ save_path = os.path.join(save_folder, os.path.basename(args.path))
186
+ else:
187
+ save_path = os.path.join(save_folder, "camera.mp4")
188
+ logger.info(f"video save_path is {save_path}")
189
+ vid_writer = cv2.VideoWriter(
190
+ save_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (int(width), int(height))
191
+ )
192
+ while True:
193
+ ret_val, frame = cap.read()
194
+ if ret_val:
195
+ outputs, img_info = predictor.inference(frame)
196
+ result_frame = predictor.visual(outputs[0], img_info)
197
+ if args.save_result:
198
+ vid_writer.write(result_frame)
199
+ ch = cv2.waitKey(1)
200
+ if ch == 27 or ch == ord("q") or ch == ord("Q"):
201
+ break
202
+ else:
203
+ break
204
+
205
+
206
+ def main(args):
207
+ file_name = os.path.join("./yolox_outputs", args.name)
208
+ os.makedirs(file_name, exist_ok=True)
209
+
210
+ if args.save_result:
211
+ vis_folder = os.path.join(file_name, "vis_res")
212
+ os.makedirs(vis_folder, exist_ok=True)
213
+
214
+ confthre = 0.01
215
+ nmsthre = 0.65
216
+ test_size = (640, 640)
217
+ if args.conf is not None:
218
+ confthre = args.conf
219
+ if args.nms is not None:
220
+ nmsthre = args.nms
221
+ if args.tsize is not None:
222
+ test_size = (args.tsize, args.tsize)
223
+
224
+ model = build_and_load(args.ckpt, name=args.name)
225
+ model.eval()
226
+
227
+ predictor = Predictor(model, confthre, nmsthre, test_size, COCO_CLASSES, None, None)
228
+ current_time = time.localtime()
229
+ if args.demo == "image":
230
+ image_demo(predictor, vis_folder, args.path, current_time, args.save_result)
231
+ elif args.demo == "video" or args.demo == "webcam":
232
+ imageflow_demo(predictor, vis_folder, current_time, args)
233
+
234
+
235
+ if __name__ == "__main__":
236
+ args = make_parser().parse_args()
237
+ main(args)
multimodal/YOLOX/demo/MegEngine/python/dump.py ADDED
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding:utf-8 -*-
3
+ # Copyright (c) Megvii, Inc. and its affiliates.
4
+
5
+ import argparse
6
+
7
+ import megengine as mge
8
+ import numpy as np
9
+ from megengine import jit
10
+
11
+ from build import build_and_load
12
+
13
+
14
+ def make_parser():
15
+ parser = argparse.ArgumentParser("YOLOX Demo Dump")
16
+ parser.add_argument("-n", "--name", type=str, default="yolox-s", help="model name")
17
+ parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval")
18
+ parser.add_argument(
19
+ "--dump_path", default="model.mge", help="path to save the dumped model"
20
+ )
21
+ return parser
22
+
23
+
24
+ def dump_static_graph(model, graph_name="model.mge"):
25
+ model.eval()
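+ # disable in-network decoding so the dumped graph outputs raw per-grid predictions; the C++ demo decodes them itself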
26
+ model.head.decode_in_inference = False
27
+
28
+ data = mge.Tensor(np.random.random((1, 3, 640, 640)))
29
+
30
+ @jit.trace(capture_as_const=True)
31
+ def pred_func(data):
32
+ outputs = model(data)
33
+ return outputs
34
+
35
+ pred_func(data)
36
+ pred_func.dump(
37
+ graph_name,
38
+ arg_names=["data"],
39
+ optimize_for_inference=True,
40
+ enable_fuse_conv_bias_nonlinearity=True,
41
+ )
42
+
43
+
44
+ def main(args):
45
+ model = build_and_load(args.ckpt, name=args.name)
46
+ dump_static_graph(model, args.dump_path)
47
+
48
+
49
+ if __name__ == "__main__":
50
+ args = make_parser().parse_args()
51
+ main(args)
multimodal/YOLOX/demo/MegEngine/python/models/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding:utf-8 -*-
3
+ # Copyright (c) Megvii Inc. All rights reserved.
4
+
5
+ from .darknet import CSPDarknet, Darknet
6
+ from .yolo_fpn import YOLOFPN
7
+ from .yolo_head import YOLOXHead
8
+ from .yolo_pafpn import YOLOPAFPN
9
+ from .yolox import YOLOX
multimodal/YOLOX/demo/MegEngine/python/models/darknet.py ADDED
@@ -0,0 +1,154 @@
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+ # Copyright (c) Megvii Inc. All rights reserved.
4
+
5
+ import megengine.module as M
6
+
7
+ from .network_blocks import BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck
8
+
9
+
10
+ class Darknet(M.Module):
11
+ # number of blocks from dark2 to dark5.
12
+ depth2blocks = {21: [1, 2, 2, 1], 53: [2, 8, 8, 4]}
13
+
14
+ def __init__(
15
+ self, depth, in_channels=3, stem_out_channels=32, out_features=("dark3", "dark4", "dark5"),
16
+ ):
17
+ """
18
+ Args:
19
+ depth (int): depth of darknet used in model, usually use [21, 53] for this param.
20
+ in_channels (int): number of input channels, for example, use 3 for RGB image.
21
+ stem_out_channels (int): number of output channels of darknet stem.
22
+ It decides channels of darknet layer2 to layer5.
23
+ out_features (Tuple[str]): desired output layer name.
24
+ """
25
+ super().__init__()
26
+ assert out_features, "please provide output features of Darknet"
27
+ self.out_features = out_features
28
+ self.stem = M.Sequential(
29
+ BaseConv(in_channels, stem_out_channels, ksize=3, stride=1, act="lrelu"),
30
+ *self.make_group_layer(stem_out_channels, num_blocks=1, stride=2),
31
+ )
32
+ in_channels = stem_out_channels * 2 # 64
33
+
34
+ num_blocks = Darknet.depth2blocks[depth]
35
+ # create darknet with `stem_out_channels` and `num_blocks` layers.
36
+ # to make model structure more clear, we don't use `for` statement in python.
37
+ self.dark2 = M.Sequential(*self.make_group_layer(in_channels, num_blocks[0], stride=2))
38
+ in_channels *= 2 # 128
39
+ self.dark3 = M.Sequential(*self.make_group_layer(in_channels, num_blocks[1], stride=2))
40
+ in_channels *= 2 # 256
41
+ self.dark4 = M.Sequential(*self.make_group_layer(in_channels, num_blocks[2], stride=2))
42
+ in_channels *= 2 # 512
43
+
44
+ self.dark5 = M.Sequential(
45
+ *self.make_group_layer(in_channels, num_blocks[3], stride=2),
46
+ *self.make_spp_block([in_channels, in_channels * 2], in_channels * 2),
47
+ )
48
+
49
+ def make_group_layer(self, in_channels: int, num_blocks: int, stride: int = 1):
50
+ "starts with conv layer then has `num_blocks` `ResLayer`"
51
+ return [
52
+ BaseConv(in_channels, in_channels * 2, ksize=3, stride=stride, act="lrelu"),
53
+ *[(ResLayer(in_channels * 2)) for _ in range(num_blocks)]
54
+ ]
55
+
56
+ def make_spp_block(self, filters_list, in_filters):
57
+ m = M.Sequential(
58
+ *[
59
+ BaseConv(in_filters, filters_list[0], 1, stride=1, act="lrelu"),
60
+ BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"),
61
+ SPPBottleneck(
62
+ in_channels=filters_list[1],
63
+ out_channels=filters_list[0],
64
+ activation="lrelu"
65
+ ),
66
+ BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"),
67
+ BaseConv(filters_list[1], filters_list[0], 1, stride=1, act="lrelu"),
68
+ ]
69
+ )
70
+ return m
71
+
72
+ def forward(self, x):
73
+ outputs = {}
74
+ x = self.stem(x)
75
+ outputs["stem"] = x
76
+ x = self.dark2(x)
77
+ outputs["dark2"] = x
78
+ x = self.dark3(x)
79
+ outputs["dark3"] = x
80
+ x = self.dark4(x)
81
+ outputs["dark4"] = x
82
+ x = self.dark5(x)
83
+ outputs["dark5"] = x
84
+ return {k: v for k, v in outputs.items() if k in self.out_features}
85
+
86
+
87
+ class CSPDarknet(M.Module):
88
+
89
+ def __init__(
90
+ self, dep_mul, wid_mul,
91
+ out_features=("dark3", "dark4", "dark5"),
92
+ depthwise=False, act="silu",
93
+ ):
94
+ super().__init__()
95
+ assert out_features, "please provide output features of Darknet"
96
+ self.out_features = out_features
97
+ Conv = DWConv if depthwise else BaseConv
98
+
99
+ base_channels = int(wid_mul * 64) # 64
100
+ base_depth = max(round(dep_mul * 3), 1) # 3
101
+
102
+ # stem
103
+ self.stem = Focus(3, base_channels, ksize=3, act=act)
104
+
105
+ # dark2
106
+ self.dark2 = M.Sequential(
107
+ Conv(base_channels, base_channels * 2, 3, 2, act=act),
108
+ CSPLayer(
109
+ base_channels * 2, base_channels * 2,
110
+ n=base_depth, depthwise=depthwise, act=act
111
+ ),
112
+ )
113
+
114
+ # dark3
115
+ self.dark3 = M.Sequential(
116
+ Conv(base_channels * 2, base_channels * 4, 3, 2, act=act),
117
+ CSPLayer(
118
+ base_channels * 4, base_channels * 4,
119
+ n=base_depth * 3, depthwise=depthwise, act=act,
120
+ ),
121
+ )
122
+
123
+ # dark4
124
+ self.dark4 = M.Sequential(
125
+ Conv(base_channels * 4, base_channels * 8, 3, 2, act=act),
126
+ CSPLayer(
127
+ base_channels * 8, base_channels * 8,
128
+ n=base_depth * 3, depthwise=depthwise, act=act,
129
+ ),
130
+ )
131
+
132
+ # dark5
133
+ self.dark5 = M.Sequential(
134
+ Conv(base_channels * 8, base_channels * 16, 3, 2, act=act),
135
+ SPPBottleneck(base_channels * 16, base_channels * 16, activation=act),
136
+ CSPLayer(
137
+ base_channels * 16, base_channels * 16, n=base_depth,
138
+ shortcut=False, depthwise=depthwise, act=act,
139
+ ),
140
+ )
141
+
142
+ def forward(self, x):
143
+ outputs = {}
144
+ x = self.stem(x)
145
+ outputs["stem"] = x
146
+ x = self.dark2(x)
147
+ outputs["dark2"] = x
148
+ x = self.dark3(x)
149
+ outputs["dark3"] = x
150
+ x = self.dark4(x)
151
+ outputs["dark4"] = x
152
+ x = self.dark5(x)
153
+ outputs["dark5"] = x
154
+ return {k: v for k, v in outputs.items() if k in self.out_features}
multimodal/YOLOX/demo/MegEngine/python/models/network_blocks.py ADDED
@@ -0,0 +1,183 @@
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+ # Copyright (c) Megvii Inc. All rights reserved.
4
+
5
+ import megengine.functional as F
6
+ import megengine.module as M
7
+
8
+
9
+ class UpSample(M.Module):
10
+
11
+ def __init__(self, scale_factor=2, mode="bilinear"):
12
+ super().__init__()
13
+ self.scale_factor = scale_factor
14
+ self.mode = mode
15
+
16
+ def forward(self, x):
17
+ return F.vision.interpolate(x, scale_factor=self.scale_factor, mode=self.mode)
18
+
19
+
20
+ class SiLU(M.Module):
21
+ """export-friendly version of M.SiLU()"""
22
+
23
+ @staticmethod
24
+ def forward(x):
25
+ return x * F.sigmoid(x)
26
+
27
+
28
+ def get_activation(name="silu"):
29
+ if name == "silu":
30
+ module = SiLU()
31
+ elif name == "relu":
32
+ module = M.ReLU()
33
+ elif name == "lrelu":
34
+ module = M.LeakyReLU(0.1)
35
+ else:
36
+ raise AttributeError("Unsupported act type: {}".format(name))
37
+ return module
38
+
39
+
40
+ class BaseConv(M.Module):
41
+ """A Conv2d -> Batchnorm -> silu/leaky relu block"""
42
+
43
+ def __init__(self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu"):
44
+ super().__init__()
45
+ # same padding
46
+ pad = (ksize - 1) // 2
47
+ self.conv = M.Conv2d(
48
+ in_channels,
49
+ out_channels,
50
+ kernel_size=ksize,
51
+ stride=stride,
52
+ padding=pad,
53
+ groups=groups,
54
+ bias=bias,
55
+ )
56
+ self.bn = M.BatchNorm2d(out_channels)
57
+ self.act = get_activation(act)
58
+
59
+ def forward(self, x):
60
+ return self.act(self.bn(self.conv(x)))
61
+
62
+ def fuseforward(self, x):
63
+ return self.act(self.conv(x))
64
+
65
+
66
+ class DWConv(M.Module):
67
+ """Depthwise Conv + Conv"""
68
+ def __init__(self, in_channels, out_channels, ksize, stride=1, act="silu"):
69
+ super().__init__()
70
+ self.dconv = BaseConv(
71
+ in_channels, in_channels, ksize=ksize,
72
+ stride=stride, groups=in_channels, act=act
73
+ )
74
+ self.pconv = BaseConv(
75
+ in_channels, out_channels, ksize=1,
76
+ stride=1, groups=1, act=act
77
+ )
78
+
79
+ def forward(self, x):
80
+ x = self.dconv(x)
81
+ return self.pconv(x)
82
+
83
+
84
+ class Bottleneck(M.Module):
85
+ # Standard bottleneck
86
+ def __init__(
87
+ self, in_channels, out_channels, shortcut=True,
88
+ expansion=0.5, depthwise=False, act="silu"
89
+ ):
90
+ super().__init__()
91
+ hidden_channels = int(out_channels * expansion)
92
+ Conv = DWConv if depthwise else BaseConv
93
+ self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act)
94
+ self.conv2 = Conv(hidden_channels, out_channels, 3, stride=1, act=act)
95
+ self.use_add = shortcut and in_channels == out_channels
96
+
97
+ def forward(self, x):
98
+ y = self.conv2(self.conv1(x))
99
+ if self.use_add:
100
+ y = y + x
101
+ return y
102
+
103
+
104
+ class ResLayer(M.Module):
105
+ "Residual layer with `in_channels` inputs."
106
+ def __init__(self, in_channels: int):
107
+ super().__init__()
108
+ mid_channels = in_channels // 2
109
+ self.layer1 = BaseConv(in_channels, mid_channels, ksize=1, stride=1, act="lrelu")
110
+ self.layer2 = BaseConv(mid_channels, in_channels, ksize=3, stride=1, act="lrelu")
111
+
112
+ def forward(self, x):
113
+ out = self.layer2(self.layer1(x))
114
+ return x + out
115
+
116
+
117
+ class SPPBottleneck(M.Module):
118
+ """Spatial pyramid pooling layer used in YOLOv3-SPP"""
119
+ def __init__(self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu"):
120
+ super().__init__()
121
+ hidden_channels = in_channels // 2
122
+ self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=activation)
123
+ self.m = [M.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) for ks in kernel_sizes]
124
+ conv2_channels = hidden_channels * (len(kernel_sizes) + 1)
125
+ self.conv2 = BaseConv(conv2_channels, out_channels, 1, stride=1, act=activation)
126
+
127
+ def forward(self, x):
128
+ x = self.conv1(x)
129
+ x = F.concat([x] + [m(x) for m in self.m], axis=1)
130
+ x = self.conv2(x)
131
+ return x
132
+
133
+
134
+ class CSPLayer(M.Module):
135
+ """C3 in yolov5, CSP Bottleneck with 3 convolutions"""
136
+
137
+ def __init__(
138
+ self, in_channels, out_channels, n=1,
139
+ shortcut=True, expansion=0.5, depthwise=False, act="silu"
140
+ ):
141
+ """
142
+ Args:
143
+ in_channels (int): input channels.
144
+ out_channels (int): output channels.
145
+ n (int): number of Bottlenecks. Default value: 1.
146
+ """
147
+ # ch_in, ch_out, number, shortcut, groups, expansion
148
+ super().__init__()
149
+ hidden_channels = int(out_channels * expansion) # hidden channels
150
+ self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act)
151
+ self.conv2 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act)
152
+ self.conv3 = BaseConv(2 * hidden_channels, out_channels, 1, stride=1, act=act)
153
+ module_list = [
154
+ Bottleneck(hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act)
155
+ for _ in range(n)
156
+ ]
157
+ self.m = M.Sequential(*module_list)
158
+
159
+ def forward(self, x):
160
+ x_1 = self.conv1(x)
161
+ x_2 = self.conv2(x)
162
+ x_1 = self.m(x_1)
163
+ x = F.concat((x_1, x_2), axis=1)
164
+ return self.conv3(x)
165
+
166
+
167
+ class Focus(M.Module):
168
+ """Focus width and height information into channel space."""
169
+
170
+ def __init__(self, in_channels, out_channels, ksize=1, stride=1, act="silu"):
171
+ super().__init__()
172
+ self.conv = BaseConv(in_channels * 4, out_channels, ksize, stride, act=act)
173
+
174
+ def forward(self, x):
175
+ # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2)
176
+ patch_top_left = x[..., ::2, ::2]
177
+ patch_top_right = x[..., ::2, 1::2]
178
+ patch_bot_left = x[..., 1::2, ::2]
179
+ patch_bot_right = x[..., 1::2, 1::2]
180
+ x = F.concat(
181
+ (patch_top_left, patch_bot_left, patch_top_right, patch_bot_right,), axis=1,
182
+ )
183
+ return self.conv(x)
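The `Focus` block above moves every 2x2 spatial patch into the channel dimension, so a (b, c, h, w) input becomes (b, 4c, h/2, w/2) before the first convolution. The snippet below reproduces that slicing directly with `megengine.functional` as an illustrative shape check; it is not part of the repository.

```python
import numpy as np
import megengine as mge
import megengine.functional as F

x = mge.Tensor(np.random.random((1, 3, 640, 640)).astype("float32"))

# take every other pixel, offset by (0, 0), (1, 0), (0, 1), (1, 1)
patches = (
    x[..., ::2, ::2],    # top-left
    x[..., 1::2, ::2],   # bottom-left
    x[..., ::2, 1::2],   # top-right
    x[..., 1::2, 1::2],  # bottom-right
)
y = F.concat(patches, axis=1)
print(x.shape, "->", y.shape)  # (1, 3, 640, 640) -> (1, 12, 320, 320)
```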
multimodal/YOLOX/demo/MegEngine/python/models/yolo_fpn.py ADDED
@@ -0,0 +1,78 @@
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+ # Copyright (c) Megvii Inc. All rights reserved.
4
+
5
+ import megengine.functional as F
6
+ import megengine.module as M
7
+
8
+ from .darknet import Darknet
9
+ from .network_blocks import BaseConv, UpSample
10
+
11
+
12
+ class YOLOFPN(M.Module):
13
+ """
14
+ YOLOFPN module. Darknet 53 is the default backbone of this model.
15
+ """
16
+
17
+ def __init__(
18
+ self, depth=53, in_features=["dark3", "dark4", "dark5"],
19
+ ):
20
+ super().__init__()
21
+
22
+ self.backbone = Darknet(depth)
23
+ self.in_features = in_features
24
+
25
+ # out 1
26
+ self.out1_cbl = self._make_cbl(512, 256, 1)
27
+ self.out1 = self._make_embedding([256, 512], 512 + 256)
28
+
29
+ # out 2
30
+ self.out2_cbl = self._make_cbl(256, 128, 1)
31
+ self.out2 = self._make_embedding([128, 256], 256 + 128)
32
+
33
+ # upsample
34
+ self.upsample = UpSample(scale_factor=2, mode="bilinear")
35
+
36
+ def _make_cbl(self, _in, _out, ks):
37
+ return BaseConv(_in, _out, ks, stride=1, act="lrelu")
38
+
39
+ def _make_embedding(self, filters_list, in_filters):
40
+ m = M.Sequential(
41
+ *[
42
+ self._make_cbl(in_filters, filters_list[0], 1),
43
+ self._make_cbl(filters_list[0], filters_list[1], 3),
44
+
45
+ self._make_cbl(filters_list[1], filters_list[0], 1),
46
+
47
+ self._make_cbl(filters_list[0], filters_list[1], 3),
48
+ self._make_cbl(filters_list[1], filters_list[0], 1),
49
+ ]
50
+ )
51
+ return m
52
+
53
+ def forward(self, inputs):
54
+ """
55
+ Args:
56
+ inputs (Tensor): input image.
57
+
58
+ Returns:
59
+ Tuple[Tensor]: FPN output features.
60
+ """
61
+ # backbone
62
+ out_features = self.backbone(inputs)
63
+ x2, x1, x0 = [out_features[f] for f in self.in_features]
64
+
65
+ # yolo branch 1
66
+ x1_in = self.out1_cbl(x0)
67
+ x1_in = self.upsample(x1_in)
68
+ x1_in = F.concat([x1_in, x1], 1)
69
+ out_dark4 = self.out1(x1_in)
70
+
71
+ # yolo branch 2
72
+ x2_in = self.out2_cbl(out_dark4)
73
+ x2_in = self.upsample(x2_in)
74
+ x2_in = F.concat([x2_in, x2], 1)
75
+ out_dark3 = self.out2(x2_in)
76
+
77
+ outputs = (out_dark3, out_dark4, x0)
78
+ return outputs
multimodal/YOLOX/demo/MegEngine/python/models/yolo_head.py ADDED
@@ -0,0 +1,192 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding:utf-8 -*-
3
+ # Copyright (c) Megvii Inc. All rights reserved.
4
+
5
+ import megengine.functional as F
6
+ import megengine.module as M
7
+
8
+ from .network_blocks import BaseConv, DWConv
9
+
10
+
11
+ def meshgrid(x, y):
12
+ """meshgrid wrapper for megengine"""
13
+ assert len(x.shape) == 1
14
+ assert len(y.shape) == 1
15
+ mesh_shape = (y.shape[0], x.shape[0])
16
+ mesh_x = F.broadcast_to(x, mesh_shape)
17
+ mesh_y = F.broadcast_to(y.reshape(-1, 1), mesh_shape)
18
+ return mesh_x, mesh_y
19
+
20
+
21
+ class YOLOXHead(M.Module):
22
+ def __init__(
23
+ self, num_classes, width=1.0, strides=[8, 16, 32],
24
+ in_channels=[256, 512, 1024], act="silu", depthwise=False
25
+ ):
26
+ """
27
+ Args:
28
+ act (str): activation type of conv. Default value: "silu".
29
+ depthwise (bool): whether to apply depthwise conv in the conv branch. Default value: False.
30
+ """
31
+ super().__init__()
32
+
33
+ self.n_anchors = 1
34
+ self.num_classes = num_classes
35
+ self.decode_in_inference = True # save for matching
36
+
37
+ self.cls_convs = []
38
+ self.reg_convs = []
39
+ self.cls_preds = []
40
+ self.reg_preds = []
41
+ self.obj_preds = []
42
+ self.stems = []
43
+ Conv = DWConv if depthwise else BaseConv
44
+
45
+ for i in range(len(in_channels)):
46
+ self.stems.append(
47
+ BaseConv(
48
+ in_channels=int(in_channels[i] * width),
49
+ out_channels=int(256 * width),
50
+ ksize=1,
51
+ stride=1,
52
+ act=act,
53
+ )
54
+ )
55
+ self.cls_convs.append(
56
+ M.Sequential(
57
+ *[
58
+ Conv(
59
+ in_channels=int(256 * width),
60
+ out_channels=int(256 * width),
61
+ ksize=3,
62
+ stride=1,
63
+ act=act,
64
+ ),
65
+ Conv(
66
+ in_channels=int(256 * width),
67
+ out_channels=int(256 * width),
68
+ ksize=3,
69
+ stride=1,
70
+ act=act,
71
+ ),
72
+ ]
73
+ )
74
+ )
75
+ self.reg_convs.append(
76
+ M.Sequential(
77
+ *[
78
+ Conv(
79
+ in_channels=int(256 * width),
80
+ out_channels=int(256 * width),
81
+ ksize=3,
82
+ stride=1,
83
+ act=act,
84
+ ),
85
+ Conv(
86
+ in_channels=int(256 * width),
87
+ out_channels=int(256 * width),
88
+ ksize=3,
89
+ stride=1,
90
+ act=act,
91
+ ),
92
+ ]
93
+ )
94
+ )
95
+ self.cls_preds.append(
96
+ M.Conv2d(
97
+ in_channels=int(256 * width),
98
+ out_channels=self.n_anchors * self.num_classes,
99
+ kernel_size=1,
100
+ stride=1,
101
+ padding=0,
102
+ )
103
+ )
104
+ self.reg_preds.append(
105
+ M.Conv2d(
106
+ in_channels=int(256 * width),
107
+ out_channels=4,
108
+ kernel_size=1,
109
+ stride=1,
110
+ padding=0,
111
+ )
112
+ )
113
+ self.obj_preds.append(
114
+ M.Conv2d(
115
+ in_channels=int(256 * width),
116
+ out_channels=self.n_anchors * 1,
117
+ kernel_size=1,
118
+ stride=1,
119
+ padding=0,
120
+ )
121
+ )
122
+
123
+ self.use_l1 = False
124
+ self.strides = strides
125
+ self.grids = [F.zeros(1)] * len(in_channels)
126
+
127
+ def forward(self, xin, labels=None, imgs=None):
128
+ outputs = []
129
+ assert not self.training
130
+
131
+ for k, (cls_conv, reg_conv, stride_this_level, x) in enumerate(
132
+ zip(self.cls_convs, self.reg_convs, self.strides, xin)
133
+ ):
134
+ x = self.stems[k](x)
135
+ cls_x = x
136
+ reg_x = x
137
+
138
+ cls_feat = cls_conv(cls_x)
139
+ cls_output = self.cls_preds[k](cls_feat)
140
+
141
+ reg_feat = reg_conv(reg_x)
142
+ reg_output = self.reg_preds[k](reg_feat)
143
+ obj_output = self.obj_preds[k](reg_feat)
144
+ output = F.concat([reg_output, F.sigmoid(obj_output), F.sigmoid(cls_output)], 1)
145
+ outputs.append(output)
146
+
147
+ self.hw = [x.shape[-2:] for x in outputs]
148
+ # [batch, n_anchors_all, 85]
149
+ outputs = F.concat([F.flatten(x, start_axis=2) for x in outputs], axis=2)
150
+ outputs = F.transpose(outputs, (0, 2, 1))
151
+ if self.decode_in_inference:
152
+ return self.decode_outputs(outputs)
153
+ else:
154
+ return outputs
155
+
156
+ def get_output_and_grid(self, output, k, stride, dtype):
157
+ grid = self.grids[k]
158
+
159
+ batch_size = output.shape[0]
160
+ n_ch = 5 + self.num_classes
161
+ hsize, wsize = output.shape[-2:]
162
+ if grid.shape[2:4] != output.shape[2:4]:
163
+ yv, xv = meshgrid([F.arange(hsize), F.arange(wsize)])
164
+ grid = F.stack((xv, yv), 2).reshape(1, 1, hsize, wsize, 2).type(dtype)
165
+ self.grids[k] = grid
166
+
167
+ output = output.view(batch_size, self.n_anchors, n_ch, hsize, wsize)
168
+ output = (
169
+ output.permute(0, 1, 3, 4, 2)
170
+ .reshape(batch_size, self.n_anchors * hsize * wsize, -1)
171
+ )
172
+ grid = grid.view(1, -1, 2)
173
+ output[..., :2] = (output[..., :2] + grid) * stride
174
+ output[..., 2:4] = F.exp(output[..., 2:4]) * stride
175
+ return output, grid
176
+
177
+ def decode_outputs(self, outputs):
178
+ grids = []
179
+ strides = []
180
+ for (hsize, wsize), stride in zip(self.hw, self.strides):
181
+ xv, yv = meshgrid(F.arange(hsize), F.arange(wsize))
182
+ grid = F.stack((xv, yv), 2).reshape(1, -1, 2)
183
+ grids.append(grid)
184
+ shape = grid.shape[:2]
185
+ strides.append(F.full((*shape, 1), stride))
186
+
187
+ grids = F.concat(grids, axis=1)
188
+ strides = F.concat(strides, axis=1)
189
+
190
+ outputs[..., :2] = (outputs[..., :2] + grids) * strides
191
+ outputs[..., 2:4] = F.exp(outputs[..., 2:4]) * strides
192
+ return outputs
multimodal/YOLOX/demo/MegEngine/python/models/yolo_pafpn.py ADDED
@@ -0,0 +1,111 @@
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+ # Copyright (c) Megvii Inc. All rights reserved.
4
+
5
+ import megengine.module as M
6
+ import megengine.functional as F
7
+
8
+ from .darknet import CSPDarknet
9
+ from .network_blocks import BaseConv, CSPLayer, DWConv, UpSample
10
+
11
+
12
+ class YOLOPAFPN(M.Module):
13
+ """
14
+ YOLOv3 model. Darknet 53 is the default backbone of this model.
15
+ """
16
+
17
+ def __init__(
18
+ self, depth=1.0, width=1.0, in_features=("dark3", "dark4", "dark5"),
19
+ in_channels=[256, 512, 1024], depthwise=False, act="silu",
20
+ ):
21
+ super().__init__()
22
+ self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act)
23
+ self.in_features = in_features
24
+ self.in_channels = in_channels
25
+ Conv = DWConv if depthwise else BaseConv
26
+
27
+ self.upsample = UpSample(scale_factor=2, mode="bilinear")
28
+ self.lateral_conv0 = BaseConv(
29
+ int(in_channels[2] * width), int(in_channels[1] * width), 1, 1, act=act
30
+ )
31
+ self.C3_p4 = CSPLayer(
32
+ int(2 * in_channels[1] * width),
33
+ int(in_channels[1] * width),
34
+ round(3 * depth),
35
+ False,
36
+ depthwise=depthwise,
37
+ act=act,
38
+ ) # cat
39
+
40
+ self.reduce_conv1 = BaseConv(
41
+ int(in_channels[1] * width), int(in_channels[0] * width), 1, 1, act=act
42
+ )
43
+ self.C3_p3 = CSPLayer(
44
+ int(2 * in_channels[0] * width),
45
+ int(in_channels[0] * width),
46
+ round(3 * depth),
47
+ False,
48
+ depthwise=depthwise,
49
+ act=act,
50
+ )
51
+
52
+ # bottom-up conv
53
+ self.bu_conv2 = Conv(
54
+ int(in_channels[0] * width), int(in_channels[0] * width), 3, 2, act=act
55
+ )
56
+ self.C3_n3 = CSPLayer(
57
+ int(2 * in_channels[0] * width),
58
+ int(in_channels[1] * width),
59
+ round(3 * depth),
60
+ False,
61
+ depthwise=depthwise,
62
+ act=act,
63
+ )
64
+
65
+ # bottom-up conv
66
+ self.bu_conv1 = Conv(
67
+ int(in_channels[1] * width), int(in_channels[1] * width), 3, 2, act=act
68
+ )
69
+ self.C3_n4 = CSPLayer(
70
+ int(2 * in_channels[1] * width),
71
+ int(in_channels[2] * width),
72
+ round(3 * depth),
73
+ False,
74
+ depthwise=depthwise,
75
+ act=act,
76
+ )
77
+
78
+ def forward(self, input):
79
+ """
80
+ Args:
81
+ inputs: input images.
82
+
83
+ Returns:
84
+ Tuple[Tensor]: FPN feature.
85
+ """
86
+
87
+ # backbone
88
+ out_features = self.backbone(input)
89
+ features = [out_features[f] for f in self.in_features]
90
+ [x2, x1, x0] = features
91
+
92
+ fpn_out0 = self.lateral_conv0(x0) # 1024->512/32
93
+ f_out0 = self.upsample(fpn_out0) # 512/16
94
+ f_out0 = F.concat([f_out0, x1], 1) # 512->1024/16
95
+ f_out0 = self.C3_p4(f_out0) # 1024->512/16
96
+
97
+ fpn_out1 = self.reduce_conv1(f_out0) # 512->256/16
98
+ f_out1 = self.upsample(fpn_out1) # 256/8
99
+ f_out1 = F.concat([f_out1, x2], 1) # 256->512/8
100
+ pan_out2 = self.C3_p3(f_out1) # 512->256/8
101
+
102
+ p_out1 = self.bu_conv2(pan_out2) # 256->256/16
103
+ p_out1 = F.concat([p_out1, fpn_out1], 1) # 256->512/16
104
+ pan_out1 = self.C3_n3(p_out1) # 512->512/16
105
+
106
+ p_out0 = self.bu_conv1(pan_out1) # 512->512/32
107
+ p_out0 = F.concat([p_out0, fpn_out0], 1) # 512->1024/32
108
+ pan_out0 = self.C3_n4(p_out0) # 1024->1024/32
109
+
110
+ outputs = (pan_out2, pan_out1, pan_out0)
111
+ return outputs
multimodal/YOLOX/demo/MegEngine/python/models/yolox.py ADDED
@@ -0,0 +1,34 @@
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+ # Copyright (c) Megvii Inc. All rights reserved.
4
+
5
+ import megengine.module as M
6
+
7
+ from .yolo_head import YOLOXHead
8
+ from .yolo_pafpn import YOLOPAFPN
9
+
10
+
11
+ class YOLOX(M.Module):
12
+ """
13
+ YOLOX model module. The module list is defined by create_yolov3_modules function.
14
+ The network returns loss values from three YOLO layers during training
15
+ and detection results during test.
16
+ """
17
+
18
+ def __init__(self, backbone=None, head=None):
19
+ super().__init__()
20
+ if backbone is None:
21
+ backbone = YOLOPAFPN()
22
+ if head is None:
23
+ head = YOLOXHead(80)
24
+
25
+ self.backbone = backbone
26
+ self.head = head
27
+
28
+ def forward(self, x):
29
+ # fpn output content features of [dark3, dark4, dark5]
30
+ fpn_outs = self.backbone(x)
31
+ assert not self.training
32
+ outputs = self.head(fpn_outs)
33
+
34
+ return outputs
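To see how these modules fit together at inference time, a minimal shape-check sketch is given below. It assumes it is run from this demo directory so that the `models` package above is importable; the 0.33/0.50 depth and width multipliers are the usual yolox-s settings, and no checkpoint is loaded, so the outputs are random rather than meaningful detections.

```python
import numpy as np
import megengine as mge

from models import YOLOX, YOLOPAFPN, YOLOXHead  # the package defined above

depth, width = 0.33, 0.50                 # yolox-s style multipliers
backbone = YOLOPAFPN(depth, width)
head = YOLOXHead(num_classes=80, width=width)
model = YOLOX(backbone, head)
model.eval()                              # forward() asserts eval mode

dummy = mge.Tensor(np.random.random((1, 3, 640, 640)).astype("float32"))
preds = model(dummy)                      # decoded (1, n_anchors_all, 85) predictions
print(preds.shape)
```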
multimodal/YOLOX/demo/ONNXRuntime/README.md ADDED
@@ -0,0 +1,78 @@
1
+ ## YOLOX-ONNXRuntime in Python
2
+
3
+ This doc introduces how to convert your PyTorch model into ONNX, and how to run an ONNX Runtime demo to verify your conversion.
4
+
5
+ ### Step1: Install onnxruntime
6
+
7
+ Run the following command to install onnxruntime:
8
+ ```shell
9
+ pip install onnxruntime
10
+ ```
11
+
12
+ ### Step2: Get ONNX models
13
+
14
+ Users can either download our pre-generated ONNX models or convert their own models to ONNX.
15
+
16
+ #### Download ONNX models.
17
+
18
+ | Model | Parameters | GFLOPs | Test Size | mAP | Weights |
19
+ |:------| :----: | :----: | :---: | :---: | :---: |
20
+ | YOLOX-Nano | 0.91M | 1.08 | 416x416 | 25.8 |[github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_nano.onnx) |
21
+ | YOLOX-Tiny | 5.06M | 6.45 | 416x416 |32.8 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_tiny.onnx) |
22
+ | YOLOX-S | 9.0M | 26.8 | 640x640 |40.5 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_s.onnx) |
23
+ | YOLOX-M | 25.3M | 73.8 | 640x640 |47.2 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_m.onnx) |
24
+ | YOLOX-L | 54.2M | 155.6 | 640x640 |50.1 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_l.onnx) |
25
+ | YOLOX-Darknet53| 63.72M | 185.3 | 640x640 |48.0 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_darknet.onnx) |
26
+ | YOLOX-X | 99.1M | 281.9 | 640x640 |51.5 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_x.onnx) |
27
+
28
+ #### Convert Your Model to ONNX
29
+
30
+ First, you should move to <YOLOX_HOME> by:
31
+ ```shell
32
+ cd <YOLOX_HOME>
33
+ ```
34
+ Then, you can:
35
+
36
+ 1. Convert a standard YOLOX model by -n:
37
+ ```shell
38
+ python3 tools/export_onnx.py --output-name yolox_s.onnx -n yolox-s -c yolox_s.pth
39
+ ```
40
+ Notes:
41
+ * -n: specify a model name. The model name must be one of yolox-s, yolox-m, yolox-l, yolox-x, yolox-nano, yolox-tiny, or yolov3.
42
+ * -c: the model you have trained
43
+ * -o: opset version, default 11. **However, if you plan to further convert your onnx model to [OpenVINO](https://github.com/Megvii-BaseDetection/YOLOX/demo/OpenVINO/), please set the opset version to 10.**
44
+ * --no-onnxsim: disable onnxsim
45
+ * To customize the input shape of the onnx model, modify the following code in tools/export_onnx.py:
46
+
47
+ ```python
48
+ dummy_input = torch.randn(1, 3, exp.test_size[0], exp.test_size[1])
49
+ ```
50
+
51
+ 2. Convert a standard YOLOX model by -f. When using -f, the above command is equivalent to:
52
+
53
+ ```shell
54
+ python3 tools/export_onnx.py --output-name yolox_s.onnx -f exps/default/yolox_s.py -c yolox_s.pth
55
+ ```
56
+
57
+ 3. To convert your customized model, please use -f:
58
+
59
+ ```shell
60
+ python3 tools/export_onnx.py --output-name your_yolox.onnx -f exps/your_dir/your_yolox.py -c your_yolox.pth
61
+ ```
62
+
63
+ ### Step3: ONNXRuntime Demo
64
+
65
+ Step1.
66
+ ```shell
67
+ cd <YOLOX_HOME>/demo/ONNXRuntime
68
+ ```
69
+
70
+ Step2.
71
+ ```shell
72
+ python3 onnx_inference.py -m <ONNX_MODEL_PATH> -i <IMAGE_PATH> -o <OUTPUT_DIR> -s 0.3 --input_shape 640,640
73
+ ```
74
+ Notes:
75
+ * -m: your converted onnx model
76
+ * -i: input_image
77
+ * -s: score threshold for visualization.
78
+ * --input_shape: should be consistent with the shape you used for ONNX conversion.
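If you are not sure which shape an ONNX file was exported with, it can be read back from the model before running the demo. A short sketch (the model path is a placeholder):

```python
import onnxruntime

session = onnxruntime.InferenceSession("yolox_s.onnx")  # placeholder path
inp = session.get_inputs()[0]
print(inp.name, inp.shape)  # e.g. images [1, 3, 640, 640]
```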
multimodal/YOLOX/demo/ONNXRuntime/onnx_inference.py ADDED
@@ -0,0 +1,86 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Megvii, Inc. and its affiliates.
3
+
4
+ import argparse
5
+ import os
6
+
7
+ import cv2
8
+ import numpy as np
9
+
10
+ import onnxruntime
11
+
12
+ from yolox.data.data_augment import preproc as preprocess
13
+ from yolox.data.datasets import COCO_CLASSES
14
+ from yolox.utils import mkdir, multiclass_nms, demo_postprocess, vis
15
+
16
+
17
+ def make_parser():
18
+ parser = argparse.ArgumentParser("onnxruntime inference sample")
19
+ parser.add_argument(
20
+ "-m",
21
+ "--model",
22
+ type=str,
23
+ default="yolox.onnx",
24
+ help="Input your onnx model.",
25
+ )
26
+ parser.add_argument(
27
+ "-i",
28
+ "--image_path",
29
+ type=str,
30
+ default='test_image.png',
31
+ help="Path to your input image.",
32
+ )
33
+ parser.add_argument(
34
+ "-o",
35
+ "--output_dir",
36
+ type=str,
37
+ default='demo_output',
38
+ help="Path to your output directory.",
39
+ )
40
+ parser.add_argument(
41
+ "-s",
42
+ "--score_thr",
43
+ type=float,
44
+ default=0.3,
45
+ help="Score threshold to filter the result.",
46
+ )
47
+ parser.add_argument(
48
+ "--input_shape",
49
+ type=str,
50
+ default="640,640",
51
+ help="Specify an input shape for inference.",
52
+ )
53
+ return parser
54
+
55
+
56
+ if __name__ == '__main__':
57
+ args = make_parser().parse_args()
58
+
59
+ input_shape = tuple(map(int, args.input_shape.split(',')))
60
+ origin_img = cv2.imread(args.image_path)
61
+ img, ratio = preprocess(origin_img, input_shape)
62
+
63
+ session = onnxruntime.InferenceSession(args.model)
64
+
65
+ ort_inputs = {session.get_inputs()[0].name: img[None, :, :, :]}
66
+ output = session.run(None, ort_inputs)
67
+ predictions = demo_postprocess(output[0], input_shape)[0]
68
+
69
+ boxes = predictions[:, :4]
70
+ scores = predictions[:, 4:5] * predictions[:, 5:]
71
+
72
+ boxes_xyxy = np.ones_like(boxes)
73
+ boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2]/2.
74
+ boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3]/2.
75
+ boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2]/2.
76
+ boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3]/2.
77
+ boxes_xyxy /= ratio
78
+ dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1)
79
+ if dets is not None:
80
+ final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5]
81
+ origin_img = vis(origin_img, final_boxes, final_scores, final_cls_inds,
82
+ conf=args.score_thr, class_names=COCO_CLASSES)
83
+
84
+ mkdir(args.output_dir)
85
+ output_path = os.path.join(args.output_dir, os.path.basename(args.image_path))
86
+ cv2.imwrite(output_path, origin_img)
multimodal/YOLOX/demo/OpenVINO/README.md ADDED
@@ -0,0 +1,4 @@
1
+ ## YOLOX for OpenVINO
2
+
3
+ * [C++ Demo](./cpp)
4
+ * [Python Demo](./python)
multimodal/YOLOX/demo/OpenVINO/cpp/CMakeLists.txt ADDED
@@ -0,0 +1,23 @@
1
+ cmake_minimum_required(VERSION 3.4.1)
2
+ set(CMAKE_CXX_STANDARD 14)
3
+
4
+ project(yolox_openvino_demo)
5
+
6
+ find_package(OpenCV REQUIRED)
7
+ find_package(InferenceEngine REQUIRED)
8
+ find_package(ngraph REQUIRED)
9
+
10
+ include_directories(
11
+ ${OpenCV_INCLUDE_DIRS}
12
+ ${CMAKE_CURRENT_SOURCE_DIR}
13
+ ${CMAKE_CURRENT_BINARY_DIR}
14
+ )
15
+
16
+ add_executable(yolox_openvino yolox_openvino.cpp)
17
+
18
+ target_link_libraries(
19
+ yolox_openvino
20
+ ${InferenceEngine_LIBRARIES}
21
+ ${NGRAPH_LIBRARIES}
22
+ ${OpenCV_LIBS}
23
+ )
multimodal/YOLOX/demo/OpenVINO/cpp/README.md ADDED
@@ -0,0 +1,97 @@
1
+ # YOLOX-OpenVINO in C++
2
+
3
+ This tutorial includes a C++ demo for OpenVINO, as well as some converted models.
4
+
5
+ ### Download OpenVINO models.
6
+
7
+ | Model | Parameters | GFLOPs | Test Size | mAP | Weights |
8
+ |:------| :----: | :----: | :---: | :---: | :---: |
9
+ | [YOLOX-Nano](../../../exps/default/nano.py) | 0.91M | 1.08 | 416x416 | 25.8 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_nano_openvino.tar.gz) |
10
+ | [YOLOX-Tiny](../../../exps/default/yolox_tiny.py) | 5.06M | 6.45 | 416x416 |32.8 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_tiny_openvino.tar.gz) |
11
+ | [YOLOX-S](../../../exps/default/yolox_s.py) | 9.0M | 26.8 | 640x640 |40.5 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_s_openvino.tar.gz) |
12
+ | [YOLOX-M](../../../exps/default/yolox_m.py) | 25.3M | 73.8 | 640x640 |47.2 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_m_openvino.tar.gz) |
13
+ | [YOLOX-L](../../../exps/default/yolox_l.py) | 54.2M | 155.6 | 640x640 |50.1 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_l_openvino.tar.gz) |
14
+ | [YOLOX-Darknet53](../../../exps/default/yolov3.py) | 63.72M | 185.3 | 640x640 |48.0 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_dark_openvino.tar.gz) |
15
+ | [YOLOX-X](../../../exps/default/yolox_x.py) | 99.1M | 281.9 | 640x640 |51.5 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_x_openvino.tar.gz) |
16
+
17
+ ## Install OpenVINO Toolkit
18
+
19
+ Please visit the [OpenVINO Homepage](https://docs.openvinotoolkit.org/latest/get_started_guides.html) for more details.
20
+
21
+ ## Set up the Environment
22
+
23
+ ### For Linux
24
+
25
+ **Option 1. Set up the environment temporarily. You need to run this command every time you start a new shell window.**
26
+
27
+ ```shell
28
+ source /opt/intel/openvino_2021/bin/setupvars.sh
29
+ ```
30
+
31
+ **Option 2. Set up the environment permanently.**
32
+
33
+ *Step1.* For Linux:
34
+ ```shell
35
+ vim ~/.bashrc
36
+ ```
37
+
38
+ *Step2.* Add the following line into your file:
39
+
40
+ ```shell
41
+ source /opt/intel/openvino_2021/bin/setupvars.sh
42
+ ```
43
+
44
+ *Step3.* Save and exit the file, then run:
45
+
46
+ ```shell
47
+ source ~/.bashrc
48
+ ```
49
+
50
+
51
+ ## Convert model
52
+
53
+ 1. Export ONNX model
54
+
55
+ Please refer to the [ONNX tutorial](../../ONNXRuntime). **Note that you should set --opset to 10, otherwise your next step will fail.**
56
+
57
+ 2. Convert ONNX to OpenVINO
58
+
59
+ ``` shell
60
+ cd <INSTALL_DIR>/openvino_2021/deployment_tools/model_optimizer
61
+ ```
62
+
63
+ Install the requirements for the conversion tool:
64
+
65
+ ```shell
66
+ sudo ./install_prerequisites/install_prerequisites_onnx.sh
67
+ ```
68
+
69
+ Then convert the model:
70
+ ```shell
71
+ python3 mo.py --input_model <ONNX_MODEL> --input_shape <INPUT_SHAPE> [--data_type FP16]
72
+ ```
73
+ For example:
74
+ ```shell
75
+ python3 mo.py --input_model yolox_tiny.onnx --input_shape [1,3,416,416] --data_type FP16
76
+ ```
77
+
78
+ Make sure the input shape is consistent with [the values](yolox_openvino.cpp#L24-L25) hard-coded in the cpp file.
79
+
80
+ ## Build
81
+
82
+ ### Linux
83
+ ```shell
84
+ source /opt/intel/openvino_2021/bin/setupvars.sh
85
+ mkdir build
86
+ cd build
87
+ cmake ..
88
+ make
89
+ ```
90
+
91
+ ## Demo
92
+
93
+ ### c++
94
+
95
+ ```shell
96
+ ./yolox_openvino <XML_MODEL_PATH> <IMAGE_PATH> <DEVICE>
97
+ ```
multimodal/YOLOX/demo/OpenVINO/cpp/yolox_openvino.cpp ADDED
@@ -0,0 +1,529 @@
1
+ // Copyright (C) 2018-2021 Intel Corporation
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ //
4
+
5
+ #include <iterator>
6
+ #include <memory>
7
+ #include <string>
8
+ #include <vector>
9
+ #include <opencv2/opencv.hpp>
10
+ #include <iostream>
11
+ #include <inference_engine.hpp>
12
+
13
+ using namespace InferenceEngine;
14
+
15
+ /**
16
+ * @brief Define names depending on Unicode path support
17
+ */
18
+ #define tcout std::cout
19
+ #define file_name_t std::string
20
+ #define imread_t cv::imread
21
+ #define NMS_THRESH 0.45
22
+ #define BBOX_CONF_THRESH 0.3
23
+
24
+ static const int INPUT_W = 416;
25
+ static const int INPUT_H = 416;
26
+ static const int NUM_CLASSES = 80; // COCO has 80 classes. Modify this value for your own dataset.
27
+
28
+ cv::Mat static_resize(cv::Mat& img) {
29
+ float r = std::min(INPUT_W / (img.cols*1.0), INPUT_H / (img.rows*1.0));
30
+ // r = std::min(r, 1.0f);
31
+ int unpad_w = r * img.cols;
32
+ int unpad_h = r * img.rows;
33
+ cv::Mat re(unpad_h, unpad_w, CV_8UC3);
34
+ cv::resize(img, re, re.size());
35
+ //cv::Mat out(INPUT_W, INPUT_H, CV_8UC3, cv::Scalar(114, 114, 114));
36
+ cv::Mat out(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(114, 114, 114));
37
+ re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)));
38
+ return out;
39
+ }
40
+
41
+ void blobFromImage(cv::Mat& img, Blob::Ptr& blob){
42
+ int channels = 3;
43
+ int img_h = img.rows;
44
+ int img_w = img.cols;
45
+ InferenceEngine::MemoryBlob::Ptr mblob = InferenceEngine::as<InferenceEngine::MemoryBlob>(blob);
46
+ if (!mblob)
47
+ {
48
+ THROW_IE_EXCEPTION << "We expect blob to be inherited from MemoryBlob in matU8ToBlob, "
49
+ << "but by fact we were not able to cast inputBlob to MemoryBlob";
50
+ }
51
+ // locked memory holder should be alive all time while access to its buffer happens
52
+ auto mblobHolder = mblob->wmap();
53
+
54
+ float *blob_data = mblobHolder.as<float *>();
55
+
56
+ for (size_t c = 0; c < channels; c++)
57
+ {
58
+ for (size_t h = 0; h < img_h; h++)
59
+ {
60
+ for (size_t w = 0; w < img_w; w++)
61
+ {
62
+ blob_data[c * img_w * img_h + h * img_w + w] =
63
+ (float)img.at<cv::Vec3b>(h, w)[c];
64
+ }
65
+ }
66
+ }
67
+ }
68
+
69
+
70
+ struct Object
71
+ {
72
+ cv::Rect_<float> rect;
73
+ int label;
74
+ float prob;
75
+ };
76
+
77
+ struct GridAndStride
78
+ {
79
+ int grid0;
80
+ int grid1;
81
+ int stride;
82
+ };
83
+
84
+ static void generate_grids_and_stride(const int target_w, const int target_h, std::vector<int>& strides, std::vector<GridAndStride>& grid_strides)
85
+ {
86
+ for (auto stride : strides)
87
+ {
88
+ int num_grid_w = target_w / stride;
89
+ int num_grid_h = target_h / stride;
90
+ for (int g1 = 0; g1 < num_grid_h; g1++)
91
+ {
92
+ for (int g0 = 0; g0 < num_grid_w; g0++)
93
+ {
94
+ grid_strides.push_back((GridAndStride){g0, g1, stride});
95
+ }
96
+ }
97
+ }
98
+ }
99
+
100
+
101
+ static void generate_yolox_proposals(std::vector<GridAndStride> grid_strides, const float* feat_ptr, float prob_threshold, std::vector<Object>& objects)
102
+ {
103
+
104
+ const int num_anchors = grid_strides.size();
105
+
106
+ for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++)
107
+ {
108
+ const int grid0 = grid_strides[anchor_idx].grid0;
109
+ const int grid1 = grid_strides[anchor_idx].grid1;
110
+ const int stride = grid_strides[anchor_idx].stride;
111
+
112
+ const int basic_pos = anchor_idx * (NUM_CLASSES + 5);
113
+
114
+ // yolox/models/yolo_head.py decode logic
115
+ // outputs[..., :2] = (outputs[..., :2] + grids) * strides
116
+ // outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
117
+ float x_center = (feat_ptr[basic_pos + 0] + grid0) * stride;
118
+ float y_center = (feat_ptr[basic_pos + 1] + grid1) * stride;
119
+ float w = exp(feat_ptr[basic_pos + 2]) * stride;
120
+ float h = exp(feat_ptr[basic_pos + 3]) * stride;
121
+ float x0 = x_center - w * 0.5f;
122
+ float y0 = y_center - h * 0.5f;
123
+
124
+ float box_objectness = feat_ptr[basic_pos + 4];
125
+ for (int class_idx = 0; class_idx < NUM_CLASSES; class_idx++)
126
+ {
127
+ float box_cls_score = feat_ptr[basic_pos + 5 + class_idx];
128
+ float box_prob = box_objectness * box_cls_score;
129
+ if (box_prob > prob_threshold)
130
+ {
131
+ Object obj;
132
+ obj.rect.x = x0;
133
+ obj.rect.y = y0;
134
+ obj.rect.width = w;
135
+ obj.rect.height = h;
136
+ obj.label = class_idx;
137
+ obj.prob = box_prob;
138
+
139
+ objects.push_back(obj);
140
+ }
141
+
142
+ } // class loop
143
+
144
+ } // point anchor loop
145
+ }
146
+
147
+ static inline float intersection_area(const Object& a, const Object& b)
148
+ {
149
+ cv::Rect_<float> inter = a.rect & b.rect;
150
+ return inter.area();
151
+ }
152
+
153
+ static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
154
+ {
155
+ int i = left;
156
+ int j = right;
157
+ float p = faceobjects[(left + right) / 2].prob;
158
+
159
+ while (i <= j)
160
+ {
161
+ while (faceobjects[i].prob > p)
162
+ i++;
163
+
164
+ while (faceobjects[j].prob < p)
165
+ j--;
166
+
167
+ if (i <= j)
168
+ {
169
+ // swap
170
+ std::swap(faceobjects[i], faceobjects[j]);
171
+
172
+ i++;
173
+ j--;
174
+ }
175
+ }
176
+
177
+ #pragma omp parallel sections
178
+ {
179
+ #pragma omp section
180
+ {
181
+ if (left < j) qsort_descent_inplace(faceobjects, left, j);
182
+ }
183
+ #pragma omp section
184
+ {
185
+ if (i < right) qsort_descent_inplace(faceobjects, i, right);
186
+ }
187
+ }
188
+ }
189
+
190
+
191
+ static void qsort_descent_inplace(std::vector<Object>& objects)
192
+ {
193
+ if (objects.empty())
194
+ return;
195
+
196
+ qsort_descent_inplace(objects, 0, objects.size() - 1);
197
+ }
198
+
199
+ static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold)
200
+ {
201
+ picked.clear();
202
+
203
+ const int n = faceobjects.size();
204
+
205
+ std::vector<float> areas(n);
206
+ for (int i = 0; i < n; i++)
207
+ {
208
+ areas[i] = faceobjects[i].rect.area();
209
+ }
210
+
211
+ for (int i = 0; i < n; i++)
212
+ {
213
+ const Object& a = faceobjects[i];
214
+
215
+ int keep = 1;
216
+ for (int j = 0; j < (int)picked.size(); j++)
217
+ {
218
+ const Object& b = faceobjects[picked[j]];
219
+
220
+ // intersection over union
221
+ float inter_area = intersection_area(a, b);
222
+ float union_area = areas[i] + areas[picked[j]] - inter_area;
223
+ // float IoU = inter_area / union_area
224
+ if (inter_area / union_area > nms_threshold)
225
+ keep = 0;
226
+ }
227
+
228
+ if (keep)
229
+ picked.push_back(i);
230
+ }
231
+ }
232
+
233
+
234
+ static void decode_outputs(const float* prob, std::vector<Object>& objects, float scale, const int img_w, const int img_h) {
235
+ std::vector<Object> proposals;
236
+ std::vector<int> strides = {8, 16, 32};
237
+ std::vector<GridAndStride> grid_strides;
238
+
239
+ generate_grids_and_stride(INPUT_W, INPUT_H, strides, grid_strides);
240
+ generate_yolox_proposals(grid_strides, prob, BBOX_CONF_THRESH, proposals);
241
+ qsort_descent_inplace(proposals);
242
+
243
+ std::vector<int> picked;
244
+ nms_sorted_bboxes(proposals, picked, NMS_THRESH);
245
+ int count = picked.size();
246
+ objects.resize(count);
247
+
248
+ for (int i = 0; i < count; i++)
249
+ {
250
+ objects[i] = proposals[picked[i]];
251
+
252
+ // adjust offset to original unpadded
253
+ float x0 = (objects[i].rect.x) / scale;
254
+ float y0 = (objects[i].rect.y) / scale;
255
+ float x1 = (objects[i].rect.x + objects[i].rect.width) / scale;
256
+ float y1 = (objects[i].rect.y + objects[i].rect.height) / scale;
257
+
258
+ // clip
259
+ x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
260
+ y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
261
+ x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
262
+ y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
263
+
264
+ objects[i].rect.x = x0;
265
+ objects[i].rect.y = y0;
266
+ objects[i].rect.width = x1 - x0;
267
+ objects[i].rect.height = y1 - y0;
268
+ }
269
+ }
270
+
271
+ const float color_list[80][3] =
272
+ {
273
+ {0.000, 0.447, 0.741},
274
+ {0.850, 0.325, 0.098},
275
+ {0.929, 0.694, 0.125},
276
+ {0.494, 0.184, 0.556},
277
+ {0.466, 0.674, 0.188},
278
+ {0.301, 0.745, 0.933},
279
+ {0.635, 0.078, 0.184},
280
+ {0.300, 0.300, 0.300},
281
+ {0.600, 0.600, 0.600},
282
+ {1.000, 0.000, 0.000},
283
+ {1.000, 0.500, 0.000},
284
+ {0.749, 0.749, 0.000},
285
+ {0.000, 1.000, 0.000},
286
+ {0.000, 0.000, 1.000},
287
+ {0.667, 0.000, 1.000},
288
+ {0.333, 0.333, 0.000},
289
+ {0.333, 0.667, 0.000},
290
+ {0.333, 1.000, 0.000},
291
+ {0.667, 0.333, 0.000},
292
+ {0.667, 0.667, 0.000},
293
+ {0.667, 1.000, 0.000},
294
+ {1.000, 0.333, 0.000},
295
+ {1.000, 0.667, 0.000},
296
+ {1.000, 1.000, 0.000},
297
+ {0.000, 0.333, 0.500},
298
+ {0.000, 0.667, 0.500},
299
+ {0.000, 1.000, 0.500},
300
+ {0.333, 0.000, 0.500},
301
+ {0.333, 0.333, 0.500},
302
+ {0.333, 0.667, 0.500},
303
+ {0.333, 1.000, 0.500},
304
+ {0.667, 0.000, 0.500},
305
+ {0.667, 0.333, 0.500},
306
+ {0.667, 0.667, 0.500},
307
+ {0.667, 1.000, 0.500},
308
+ {1.000, 0.000, 0.500},
309
+ {1.000, 0.333, 0.500},
310
+ {1.000, 0.667, 0.500},
311
+ {1.000, 1.000, 0.500},
312
+ {0.000, 0.333, 1.000},
313
+ {0.000, 0.667, 1.000},
314
+ {0.000, 1.000, 1.000},
315
+ {0.333, 0.000, 1.000},
316
+ {0.333, 0.333, 1.000},
317
+ {0.333, 0.667, 1.000},
318
+ {0.333, 1.000, 1.000},
319
+ {0.667, 0.000, 1.000},
320
+ {0.667, 0.333, 1.000},
321
+ {0.667, 0.667, 1.000},
322
+ {0.667, 1.000, 1.000},
323
+ {1.000, 0.000, 1.000},
324
+ {1.000, 0.333, 1.000},
325
+ {1.000, 0.667, 1.000},
326
+ {0.333, 0.000, 0.000},
327
+ {0.500, 0.000, 0.000},
328
+ {0.667, 0.000, 0.000},
329
+ {0.833, 0.000, 0.000},
330
+ {1.000, 0.000, 0.000},
331
+ {0.000, 0.167, 0.000},
332
+ {0.000, 0.333, 0.000},
333
+ {0.000, 0.500, 0.000},
334
+ {0.000, 0.667, 0.000},
335
+ {0.000, 0.833, 0.000},
336
+ {0.000, 1.000, 0.000},
337
+ {0.000, 0.000, 0.167},
338
+ {0.000, 0.000, 0.333},
339
+ {0.000, 0.000, 0.500},
340
+ {0.000, 0.000, 0.667},
341
+ {0.000, 0.000, 0.833},
342
+ {0.000, 0.000, 1.000},
343
+ {0.000, 0.000, 0.000},
344
+ {0.143, 0.143, 0.143},
345
+ {0.286, 0.286, 0.286},
346
+ {0.429, 0.429, 0.429},
347
+ {0.571, 0.571, 0.571},
348
+ {0.714, 0.714, 0.714},
349
+ {0.857, 0.857, 0.857},
350
+ {0.000, 0.447, 0.741},
351
+ {0.314, 0.717, 0.741},
352
+ {0.50, 0.5, 0}
353
+ };
354
+
355
+ static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
356
+ {
357
+ static const char* class_names[] = {
358
+ "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
359
+ "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
360
+ "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
361
+ "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
362
+ "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
363
+ "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
364
+ "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
365
+ "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
366
+ "hair drier", "toothbrush"
367
+ };
368
+
369
+ cv::Mat image = bgr.clone();
370
+
371
+ for (size_t i = 0; i < objects.size(); i++)
372
+ {
373
+ const Object& obj = objects[i];
374
+
375
+ fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
376
+ obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
377
+
378
+ cv::Scalar color = cv::Scalar(color_list[obj.label][0], color_list[obj.label][1], color_list[obj.label][2]);
379
+ float c_mean = cv::mean(color)[0];
380
+ cv::Scalar txt_color;
381
+ if (c_mean > 0.5){
382
+ txt_color = cv::Scalar(0, 0, 0);
383
+ }else{
384
+ txt_color = cv::Scalar(255, 255, 255);
385
+ }
386
+
387
+ cv::rectangle(image, obj.rect, color * 255, 2);
388
+
389
+ char text[256];
390
+ sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
391
+
392
+ int baseLine = 0;
393
+ cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine);
394
+
395
+ cv::Scalar txt_bk_color = color * 0.7 * 255;
396
+
397
+ int x = obj.rect.x;
398
+ int y = obj.rect.y + 1;
399
+ //int y = obj.rect.y - label_size.height - baseLine;
400
+ if (y > image.rows)
401
+ y = image.rows;
402
+ //if (x + label_size.width > image.cols)
403
+ //x = image.cols - label_size.width;
404
+
405
+ cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
406
+ txt_bk_color, -1);
407
+
408
+ cv::putText(image, text, cv::Point(x, y + label_size.height),
409
+ cv::FONT_HERSHEY_SIMPLEX, 0.4, txt_color, 1);
410
+ }
411
+
412
+ cv::imwrite("_demo.jpg" , image);
413
+ fprintf(stderr, "save vis file\n");
414
+ /* cv::imshow("image", image); */
415
+ /* cv::waitKey(0); */
416
+ }
417
+
418
+
419
+ int main(int argc, char* argv[]) {
420
+ try {
421
+ // ------------------------------ Parsing and validation of input arguments
422
+ // ---------------------------------
423
+ if (argc != 4) {
424
+ tcout << "Usage : " << argv[0] << " <path_to_model> <path_to_image> <device_name>" << std::endl;
425
+ return EXIT_FAILURE;
426
+ }
427
+
428
+ const file_name_t input_model {argv[1]};
429
+ const file_name_t input_image_path {argv[2]};
430
+ const std::string device_name {argv[3]};
431
+ // -----------------------------------------------------------------------------------------------------
432
+
433
+ // --------------------------- Step 1. Initialize inference engine core
434
+ // -------------------------------------
435
+ Core ie;
436
+ // -----------------------------------------------------------------------------------------------------
437
+
438
+ // Step 2. Read a model in OpenVINO Intermediate Representation (.xml and
439
+ // .bin files) or ONNX (.onnx file) format
440
+ CNNNetwork network = ie.ReadNetwork(input_model);
441
+ if (network.getOutputsInfo().size() != 1)
442
+ throw std::logic_error("Sample supports topologies with 1 output only");
443
+ if (network.getInputsInfo().size() != 1)
444
+ throw std::logic_error("Sample supports topologies with 1 input only");
445
+ // -----------------------------------------------------------------------------------------------------
446
+
447
+ // --------------------------- Step 3. Configure input & output
448
+ // ---------------------------------------------
449
+ // --------------------------- Prepare input blobs
450
+ // -----------------------------------------------------
451
+ InputInfo::Ptr input_info = network.getInputsInfo().begin()->second;
452
+ std::string input_name = network.getInputsInfo().begin()->first;
453
+
454
+ /* Mark input as resizable by setting of a resize algorithm.
455
+ * In this case we will be able to set an input blob of any shape to an
456
+ * infer request. Resize and layout conversions are executed automatically
457
+ * during inference */
458
+ //input_info->getPreProcess().setResizeAlgorithm(RESIZE_BILINEAR);
459
+ //input_info->setLayout(Layout::NHWC);
460
+ //input_info->setPrecision(Precision::FP32);
461
+
462
+ // --------------------------- Prepare output blobs
463
+ // ----------------------------------------------------
464
+ if (network.getOutputsInfo().empty()) {
465
+ std::cerr << "Network outputs info is empty" << std::endl;
466
+ return EXIT_FAILURE;
467
+ }
468
+ DataPtr output_info = network.getOutputsInfo().begin()->second;
469
+ std::string output_name = network.getOutputsInfo().begin()->first;
470
+
471
+ output_info->setPrecision(Precision::FP32);
472
+ // -----------------------------------------------------------------------------------------------------
473
+
474
+ // --------------------------- Step 4. Loading a model to the device
475
+ // ------------------------------------------
476
+ ExecutableNetwork executable_network = ie.LoadNetwork(network, device_name);
477
+ // -----------------------------------------------------------------------------------------------------
478
+
479
+ // --------------------------- Step 5. Create an infer request
480
+ // -------------------------------------------------
481
+ InferRequest infer_request = executable_network.CreateInferRequest();
482
+ // -----------------------------------------------------------------------------------------------------
483
+
484
+ // --------------------------- Step 6. Prepare input
485
+ // --------------------------------------------------------
486
+ /* Read input image to a blob and set it to an infer request without resize
487
+ * and layout conversions. */
488
+ cv::Mat image = imread_t(input_image_path);
489
+ cv::Mat pr_img = static_resize(image);
490
+ Blob::Ptr imgBlob = infer_request.GetBlob(input_name); // just wrap Mat data by Blob::Ptr
491
+ blobFromImage(pr_img, imgBlob);
492
+
493
+ // infer_request.SetBlob(input_name, imgBlob); // infer_request accepts input blob of any size
494
+ // -----------------------------------------------------------------------------------------------------
495
+
496
+ // --------------------------- Step 7. Do inference
497
+ // --------------------------------------------------------
498
+ /* Running the request synchronously */
499
+ infer_request.Infer();
500
+ // -----------------------------------------------------------------------------------------------------
501
+
502
+ // --------------------------- Step 8. Process output
503
+ // ------------------------------------------------------
504
+ const Blob::Ptr output_blob = infer_request.GetBlob(output_name);
505
+ MemoryBlob::CPtr moutput = as<MemoryBlob>(output_blob);
506
+ if (!moutput) {
507
+ throw std::logic_error("We expect output to be inherited from MemoryBlob, "
508
+ "but by fact we were not able to cast output to MemoryBlob");
509
+ }
510
+ // locked memory holder should be alive all time while access to its buffer
511
+ // happens
512
+ auto moutputHolder = moutput->rmap();
513
+ const float* net_pred = moutputHolder.as<const PrecisionTrait<Precision::FP32>::value_type*>();
514
+
515
+ int img_w = image.cols;
516
+ int img_h = image.rows;
517
+ float scale = std::min(INPUT_W / (image.cols*1.0), INPUT_H / (image.rows*1.0));
518
+ std::vector<Object> objects;
519
+
520
+ decode_outputs(net_pred, objects, scale, img_w, img_h);
521
+ draw_objects(image, objects);
522
+
523
+ // -----------------------------------------------------------------------------------------------------
524
+ } catch (const std::exception& ex) {
525
+ std::cerr << ex.what() << std::endl;
526
+ return EXIT_FAILURE;
527
+ }
528
+ return EXIT_SUCCESS;
529
+ }
multimodal/YOLOX/demo/OpenVINO/python/README.md ADDED
@@ -0,0 +1,89 @@
1
+ # YOLOX-OpenVINO in Python
2
+
3
+ This tutorial includes a Python demo for OpenVINO, as well as some converted models.
4
+
5
+ ### Download OpenVINO models.
6
+
7
+ | Model | Parameters | GFLOPs | Test Size | mAP | Weights |
8
+ |:------| :----: | :----: | :---: | :---: | :---: |
9
+ | [YOLOX-Nano](../../../exps/default/nano.py) | 0.91M | 1.08 | 416x416 | 25.8 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_nano_openvino.tar.gz) |
10
+ | [YOLOX-Tiny](../../../exps/default/yolox_tiny.py) | 5.06M | 6.45 | 416x416 |32.8 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_tiny_openvino.tar.gz) |
11
+ | [YOLOX-S](../../../exps/default/yolox_s.py) | 9.0M | 26.8 | 640x640 |40.5 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_s_openvino.tar.gz) |
12
+ | [YOLOX-M](../../../exps/default/yolox_m.py) | 25.3M | 73.8 | 640x640 |47.2 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_m_openvino.tar.gz) |
13
+ | [YOLOX-L](../../../exps/default/yolox_l.py) | 54.2M | 155.6 | 640x640 |50.1 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_l_openvino.tar.gz) |
14
+ | [YOLOX-Darknet53](../../../exps/default/yolov3.py) | 63.72M | 185.3 | 640x640 |48.0 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_dark_openvino.tar.gz) |
15
+ | [YOLOX-X](../../../exps/default/yolox_x.py) | 99.1M | 281.9 | 640x640 |51.5 | [github](https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_x_openvino.tar.gz) |
16
+
17
+ ## Install OpenVINO Toolkit
18
+
19
+ Please visit the [OpenVINO Homepage](https://docs.openvinotoolkit.org/latest/get_started_guides.html) for more details.
20
+
21
+ ## Set up the Environment
22
+
23
+ ### For Linux
24
+
25
+ **Option 1. Set up the environment temporarily. You need to run this command every time you start a new shell window.**
26
+
27
+ ```shell
28
+ source /opt/intel/openvino_2021/bin/setupvars.sh
29
+ ```
30
+
31
+ **Option 2. Set up the environment permanently.**
32
+
33
+ *Step1.* For Linux:
34
+ ```shell
35
+ vim ~/.bashrc
36
+ ```
37
+
38
+ *Step2.* Add the following line into your file:
39
+
40
+ ```shell
41
+ source /opt/intel/openvino_2021/bin/setupvars.sh
42
+ ```
43
+
44
+ *Step3.* Save and exit the file, then run:
45
+
46
+ ```shell
47
+ source ~/.bashrc
48
+ ```
49
+
50
+
51
+ ## Convert model
52
+
53
+ 1. Export ONNX model
54
+
55
+ Please refer to the [ONNX tutorial](https://github.com/Megvii-BaseDetection/YOLOX/demo/ONNXRuntime). **Note that you should set --opset to 10, otherwise your next step will fail.**
56
+
57
+ 2. Convert ONNX to OpenVINO
58
+
59
+ ``` shell
60
+ cd <INSTALL_DIR>/openvino_2021/deployment_tools/model_optimizer
61
+ ```
62
+
63
+ Install the requirements for the conversion tool:
64
+
65
+ ```shell
66
+ sudo ./install_prerequisites/install_prerequisites_onnx.sh
67
+ ```
68
+
69
+ Then convert the model:
70
+ ```shell
71
+ python3 mo.py --input_model <ONNX_MODEL> --input_shape <INPUT_SHAPE> [--data_type FP16]
72
+ ```
73
+ For example:
74
+ ```shell
75
+ python3 mo.py --input_model yolox.onnx --input_shape [1,3,640,640] --data_type FP16 --output_dir converted_output
76
+ ```
77
+
78
+ ## Demo
79
+
80
+ ### python
81
+
82
+ ```shell
83
+ python openvino_inference.py -m <XML_MODEL_PATH> -i <IMAGE_PATH>
84
+ ```
85
+ or
86
+ ```shell
87
+ python openvino_inference.py -m <XML_MODEL_PATH> -i <IMAGE_PATH> -o <OUTPUT_DIR> -s <SCORE_THR> -d <DEVICE>
88
+ ```
89
+
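+ For example, assuming the conversion step above produced `converted_output/yolox.xml` (the Model Optimizer writes a matching `.xml`/`.bin` pair) and using an illustrative image path, a run might look like:
+
+ ```shell
+ python openvino_inference.py -m converted_output/yolox.xml -i ../../../assets/dog.jpg -o demo_output -s 0.3 -d CPU
+ ```
+
+ The visualized result is written into the output directory under the same file name as the input image.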
multimodal/YOLOX/demo/OpenVINO/python/openvino_inference.py ADDED
@@ -0,0 +1,156 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright (C) 2018-2021 Intel Corporation
4
+ # SPDX-License-Identifier: Apache-2.0
5
+ # Copyright (c) Megvii, Inc. and its affiliates.
6
+
7
+ import argparse
8
+ import logging as log
9
+ import os
10
+ import sys
11
+
12
+ import cv2
13
+ import numpy as np
14
+
15
+ from openvino.inference_engine import IECore
16
+
17
+ from yolox.data.data_augment import preproc as preprocess
18
+ from yolox.data.datasets import COCO_CLASSES
19
+ from yolox.utils import mkdir, multiclass_nms, demo_postprocess, vis
20
+
21
+
22
+ def parse_args() -> argparse.Namespace:
23
+ """Parse and return command line arguments"""
24
+ parser = argparse.ArgumentParser(add_help=False)
25
+ args = parser.add_argument_group('Options')
26
+ args.add_argument(
27
+ '-h',
28
+ '--help',
29
+ action='help',
30
+ help='Show this help message and exit.')
31
+ args.add_argument(
32
+ '-m',
33
+ '--model',
34
+ required=True,
35
+ type=str,
36
+ help='Required. Path to an .xml or .onnx file with a trained model.')
37
+ args.add_argument(
38
+ '-i',
39
+ '--input',
40
+ required=True,
41
+ type=str,
42
+ help='Required. Path to an image file.')
43
+ args.add_argument(
44
+ '-o',
45
+ '--output_dir',
46
+ type=str,
47
+ default='demo_output',
48
+ help='Path to your output dir.')
49
+ args.add_argument(
50
+ '-s',
51
+ '--score_thr',
52
+ type=float,
53
+ default=0.3,
54
+ help="Score threshould to visualize the result.")
55
+ args.add_argument(
56
+ '-d',
57
+ '--device',
58
+ default='CPU',
59
+ type=str,
60
+ help='Optional. Specify the target device to infer on; CPU, GPU, \
61
+ MYRIAD, HDDL or HETERO are acceptable. The sample will look \
62
+ for a suitable plugin for the device specified. Default value \
63
+ is CPU.')
64
+ args.add_argument(
65
+ '--labels',
66
+ default=None,
67
+ type=str,
68
+ help='Optional. Path to a labels mapping file.')
69
+ args.add_argument(
70
+ '-nt',
71
+ '--number_top',
72
+ default=10,
73
+ type=int,
74
+ help='Optional. Number of top results.')
75
+ return parser.parse_args()
76
+
77
+
78
+ def main():
79
+ log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout)
80
+ args = parse_args()
81
+
82
+ # ---------------------------Step 1. Initialize inference engine core--------------------------------------------------
83
+ log.info('Creating Inference Engine')
84
+ ie = IECore()
85
+
86
+ # ---------------------------Step 2. Read a model in OpenVINO Intermediate Representation or ONNX format---------------
87
+ log.info(f'Reading the network: {args.model}')
88
+ # (.xml and .bin files) or (.onnx file)
89
+ net = ie.read_network(model=args.model)
90
+
91
+ if len(net.input_info) != 1:
92
+ log.error('Sample supports only single input topologies')
93
+ return -1
94
+ if len(net.outputs) != 1:
95
+ log.error('Sample supports only single output topologies')
96
+ return -1
97
+
98
+ # ---------------------------Step 3. Configure input & output----------------------------------------------------------
99
+ log.info('Configuring input and output blobs')
100
+ # Get names of input and output blobs
101
+ input_blob = next(iter(net.input_info))
102
+ out_blob = next(iter(net.outputs))
103
+
104
+ # Set input and output precision manually
105
+ net.input_info[input_blob].precision = 'FP32'
106
+ net.outputs[out_blob].precision = 'FP16'
107
+
108
+ # Get a number of classes recognized by a model
109
+ num_of_classes = max(net.outputs[out_blob].shape)
110
+
111
+ # ---------------------------Step 4. Loading model to the device-------------------------------------------------------
112
+ log.info('Loading the model to the plugin')
113
+ exec_net = ie.load_network(network=net, device_name=args.device)
114
+
115
+ # ---------------------------Step 5. Create infer request--------------------------------------------------------------
116
+ # load_network() method of the IECore class with a specified number of requests (default 1) returns an ExecutableNetwork
117
+ # instance which stores infer requests. So you already created Infer requests in the previous step.
118
+
119
+ # ---------------------------Step 6. Prepare input---------------------------------------------------------------------
120
+ origin_img = cv2.imread(args.input)
121
+ _, _, h, w = net.input_info[input_blob].input_data.shape
122
+ image, ratio = preprocess(origin_img, (h, w))
123
+
124
+ # ---------------------------Step 7. Do inference----------------------------------------------------------------------
125
+ log.info('Starting inference in synchronous mode')
126
+ res = exec_net.infer(inputs={input_blob: image})
127
+
128
+ # ---------------------------Step 8. Process output--------------------------------------------------------------------
129
+ res = res[out_blob]
130
+
131
+ predictions = demo_postprocess(res, (h, w))[0]
132
+
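+ # Each prediction row is [cx, cy, w, h, objectness, class_scores...] in the resized,
+ # padded input coordinate space; the per-class scores below are objectness * class score.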
133
+ boxes = predictions[:, :4]
134
+ scores = predictions[:, 4, None] * predictions[:, 5:]
135
+
136
+ boxes_xyxy = np.ones_like(boxes)
137
+ boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2]/2.
138
+ boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3]/2.
139
+ boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2]/2.
140
+ boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3]/2.
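+ # Undo the preprocessing scale so the boxes land in original-image coordinates.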
141
+ boxes_xyxy /= ratio
142
+ dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1)
143
+
144
+ if dets is not None:
145
+ final_boxes = dets[:, :4]
146
+ final_scores, final_cls_inds = dets[:, 4], dets[:, 5]
147
+ origin_img = vis(origin_img, final_boxes, final_scores, final_cls_inds,
148
+ conf=args.score_thr, class_names=COCO_CLASSES)
149
+
150
+ mkdir(args.output_dir)
151
+ output_path = os.path.join(args.output_dir, os.path.basename(args.input))
152
+ cv2.imwrite(output_path, origin_img)
153
+
154
+
155
+ if __name__ == '__main__':
156
+ sys.exit(main())
multimodal/YOLOX/demo/TensorRT/cpp/CMakeLists.txt ADDED
@@ -0,0 +1,36 @@
1
+ cmake_minimum_required(VERSION 2.6)
2
+
3
+ project(yolox)
4
+
5
+ add_definitions(-std=c++11)
6
+
7
+ option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
8
+ set(CMAKE_CXX_STANDARD 11)
9
+ set(CMAKE_BUILD_TYPE Debug)
10
+
11
+ find_package(CUDA REQUIRED)
12
+
13
+ include_directories(${PROJECT_SOURCE_DIR}/include)
14
+ # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
15
+ # cuda
16
+ include_directories(/data/cuda/cuda-10.2/cuda/include)
17
+ link_directories(/data/cuda/cuda-10.2/cuda/lib64)
18
+ # cudnn
19
+ include_directories(/data/cuda/cuda-10.2/cudnn/v8.0.4/include)
20
+ link_directories(/data/cuda/cuda-10.2/cudnn/v8.0.4/lib64)
21
+ # tensorrt
22
+ include_directories(/data/cuda/cuda-10.2/TensorRT/v7.2.1.6/include)
23
+ link_directories(/data/cuda/cuda-10.2/TensorRT/v7.2.1.6/lib)
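+ # Note: the absolute paths above are only an example layout; on a typical installation
+ # they might instead be e.g. /usr/local/cuda/include and /usr/local/cuda/lib64 for CUDA,
+ # plus the include/lib directories of your own cuDNN and TensorRT packages.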
24
+
25
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
26
+
27
+ find_package(OpenCV)
28
+ include_directories(${OpenCV_INCLUDE_DIRS})
29
+
30
+ add_executable(yolox ${PROJECT_SOURCE_DIR}/yolox.cpp)
31
+ target_link_libraries(yolox nvinfer)
32
+ target_link_libraries(yolox cudart)
33
+ target_link_libraries(yolox ${OpenCV_LIBS})
34
+
35
+ add_definitions(-O2 -pthread)
36
+
multimodal/YOLOX/demo/TensorRT/cpp/README.md ADDED
@@ -0,0 +1,48 @@
1
+ # YOLOX-TensorRT in C++
2
+
3
+ As YOLOX models are easy to convert to TensorRT using the [torch2trt repo](https://github.com/NVIDIA-AI-IOT/torch2trt),
4
+ our C++ demo does not include model conversion or engine construction like other TensorRT demos.
5
+
6
+
7
+ ## Step 1: Prepare serialized engine file
8
+
9
+ Follow the trt [python demo README](https://github.com/Megvii-BaseDetection/YOLOX/blob/main/demo/TensorRT/python/README.md) to convert and save the serialized engine file.
10
+
11
+ Check the 'model_trt.engine' file generated in Step 1; it is saved automatically in the current demo directory.
12
+
13
+
14
+ ## Step 2: build the demo
15
+
16
+ Please follow the [TensorRT Installation Guide](https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html) to install TensorRT.
17
+
18
+ You should also set the TensorRT and CUDA paths in CMakeLists.txt.
19
+
20
+ If you trained on a custom dataset, you may need to modify the value of `num_class`.
21
+
22
+ ```c++
23
+ const int num_class = 80;
24
+ ```
25
+
26
+ Install OpenCV with ```sudo apt-get install libopencv-dev``` (a newer OpenCV version such as v3.3+ is not required).
27
+
28
+ Build the demo:
29
+
30
+ ```shell
31
+ mkdir build
32
+ cd build
33
+ cmake ..
34
+ make
35
+ ```
36
+
37
+ Then run the demo:
38
+
39
+ ```shell
40
+ ./yolox ../model_trt.engine -i ../../../../assets/dog.jpg
41
+ ```
42
+
43
+ or
44
+
45
+ ```shell
46
+ ./yolox <path/to/your/engine_file> -i <path/to/image>
47
+ ```
48
+
multimodal/YOLOX/demo/TensorRT/cpp/logging.h ADDED
@@ -0,0 +1,503 @@
1
+ /*
2
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef TENSORRT_LOGGING_H
18
+ #define TENSORRT_LOGGING_H
19
+
20
+ #include "NvInferRuntimeCommon.h"
21
+ #include <cassert>
22
+ #include <ctime>
23
+ #include <iomanip>
24
+ #include <iostream>
25
+ #include <ostream>
26
+ #include <sstream>
27
+ #include <string>
28
+
29
+ using Severity = nvinfer1::ILogger::Severity;
30
+
31
+ class LogStreamConsumerBuffer : public std::stringbuf
32
+ {
33
+ public:
34
+ LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
35
+ : mOutput(stream)
36
+ , mPrefix(prefix)
37
+ , mShouldLog(shouldLog)
38
+ {
39
+ }
40
+
41
+ LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
42
+ : mOutput(other.mOutput)
43
+ {
44
+ }
45
+
46
+ ~LogStreamConsumerBuffer()
47
+ {
48
+ // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
49
+ // std::streambuf::pptr() gives a pointer to the current position of the output sequence
50
+ // if the pointer to the beginning is not equal to the pointer to the current position,
51
+ // call putOutput() to log the output to the stream
52
+ if (pbase() != pptr())
53
+ {
54
+ putOutput();
55
+ }
56
+ }
57
+
58
+ // synchronizes the stream buffer and returns 0 on success
59
+ // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
60
+ // resetting the buffer and flushing the stream
61
+ virtual int sync()
62
+ {
63
+ putOutput();
64
+ return 0;
65
+ }
66
+
67
+ void putOutput()
68
+ {
69
+ if (mShouldLog)
70
+ {
71
+ // prepend timestamp
72
+ std::time_t timestamp = std::time(nullptr);
73
+ tm* tm_local = std::localtime(&timestamp);
74
+ std::cout << "[";
75
+ std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
76
+ std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
77
+ std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
78
+ std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
79
+ std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
80
+ std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
81
+ // std::stringbuf::str() gets the string contents of the buffer
82
+ // insert the buffer contents pre-appended by the appropriate prefix into the stream
83
+ mOutput << mPrefix << str();
84
+ // set the buffer to empty
85
+ str("");
86
+ // flush the stream
87
+ mOutput.flush();
88
+ }
89
+ }
90
+
91
+ void setShouldLog(bool shouldLog)
92
+ {
93
+ mShouldLog = shouldLog;
94
+ }
95
+
96
+ private:
97
+ std::ostream& mOutput;
98
+ std::string mPrefix;
99
+ bool mShouldLog;
100
+ };
101
+
102
+ //!
103
+ //! \class LogStreamConsumerBase
104
+ //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
105
+ //!
106
+ class LogStreamConsumerBase
107
+ {
108
+ public:
109
+ LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
110
+ : mBuffer(stream, prefix, shouldLog)
111
+ {
112
+ }
113
+
114
+ protected:
115
+ LogStreamConsumerBuffer mBuffer;
116
+ };
117
+
118
+ //!
119
+ //! \class LogStreamConsumer
120
+ //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
121
+ //! Order of base classes is LogStreamConsumerBase and then std::ostream.
122
+ //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
123
+ //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
124
+ //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
125
+ //! Please do not change the order of the parent classes.
126
+ //!
127
+ class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
128
+ {
129
+ public:
130
+ //! \brief Creates a LogStreamConsumer which logs messages with level severity.
131
+ //! Reportable severity determines if the messages are severe enough to be logged.
132
+ LogStreamConsumer(Severity reportableSeverity, Severity severity)
133
+ : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
134
+ , std::ostream(&mBuffer) // links the stream buffer with the stream
135
+ , mShouldLog(severity <= reportableSeverity)
136
+ , mSeverity(severity)
137
+ {
138
+ }
139
+
140
+ LogStreamConsumer(LogStreamConsumer&& other)
141
+ : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
142
+ , std::ostream(&mBuffer) // links the stream buffer with the stream
143
+ , mShouldLog(other.mShouldLog)
144
+ , mSeverity(other.mSeverity)
145
+ {
146
+ }
147
+
148
+ void setReportableSeverity(Severity reportableSeverity)
149
+ {
150
+ mShouldLog = mSeverity <= reportableSeverity;
151
+ mBuffer.setShouldLog(mShouldLog);
152
+ }
153
+
154
+ private:
155
+ static std::ostream& severityOstream(Severity severity)
156
+ {
157
+ return severity >= Severity::kINFO ? std::cout : std::cerr;
158
+ }
159
+
160
+ static std::string severityPrefix(Severity severity)
161
+ {
162
+ switch (severity)
163
+ {
164
+ case Severity::kINTERNAL_ERROR: return "[F] ";
165
+ case Severity::kERROR: return "[E] ";
166
+ case Severity::kWARNING: return "[W] ";
167
+ case Severity::kINFO: return "[I] ";
168
+ case Severity::kVERBOSE: return "[V] ";
169
+ default: assert(0); return "";
170
+ }
171
+ }
172
+
173
+ bool mShouldLog;
174
+ Severity mSeverity;
175
+ };
176
+
177
+ //! \class Logger
178
+ //!
179
+ //! \brief Class which manages logging of TensorRT tools and samples
180
+ //!
181
+ //! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
182
+ //! and supports logging two types of messages:
183
+ //!
184
+ //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
185
+ //! - Test pass/fail messages
186
+ //!
187
+ //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
188
+ //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
189
+ //!
190
+ //! In the future, this class could be extended to support dumping test results to a file in some standard format
191
+ //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
192
+ //!
193
+ //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
194
+ //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
195
+ //! library and messages coming from the sample.
196
+ //!
197
+ //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
198
+ //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
199
+ //! object.
200
+
201
+ class Logger : public nvinfer1::ILogger
202
+ {
203
+ public:
204
+ Logger(Severity severity = Severity::kWARNING)
205
+ : mReportableSeverity(severity)
206
+ {
207
+ }
208
+
209
+ //!
210
+ //! \enum TestResult
211
+ //! \brief Represents the state of a given test
212
+ //!
213
+ enum class TestResult
214
+ {
215
+ kRUNNING, //!< The test is running
216
+ kPASSED, //!< The test passed
217
+ kFAILED, //!< The test failed
218
+ kWAIVED //!< The test was waived
219
+ };
220
+
221
+ //!
222
+ //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
223
+ //! \return The nvinfer1::ILogger associated with this Logger
224
+ //!
225
+ //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
226
+ //! we can eliminate the inheritance of Logger from ILogger
227
+ //!
228
+ nvinfer1::ILogger& getTRTLogger()
229
+ {
230
+ return *this;
231
+ }
232
+
233
+ //!
234
+ //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
235
+ //!
236
+ //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
237
+ //! inheritance from nvinfer1::ILogger
238
+ //!
239
+ void log(Severity severity, const char* msg) noexcept override
240
+ {
241
+ LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
242
+ }
243
+
244
+ //!
245
+ //! \brief Method for controlling the verbosity of logging output
246
+ //!
247
+ //! \param severity The logger will only emit messages that have severity of this level or higher.
248
+ //!
249
+ void setReportableSeverity(Severity severity)
250
+ {
251
+ mReportableSeverity = severity;
252
+ }
253
+
254
+ //!
255
+ //! \brief Opaque handle that holds logging information for a particular test
256
+ //!
257
+ //! This object is an opaque handle to information used by the Logger to print test results.
258
+ //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
259
+ //! with Logger::reportTest{Start,End}().
260
+ //!
261
+ class TestAtom
262
+ {
263
+ public:
264
+ TestAtom(TestAtom&&) = default;
265
+
266
+ private:
267
+ friend class Logger;
268
+
269
+ TestAtom(bool started, const std::string& name, const std::string& cmdline)
270
+ : mStarted(started)
271
+ , mName(name)
272
+ , mCmdline(cmdline)
273
+ {
274
+ }
275
+
276
+ bool mStarted;
277
+ std::string mName;
278
+ std::string mCmdline;
279
+ };
280
+
281
+ //!
282
+ //! \brief Define a test for logging
283
+ //!
284
+ //! \param[in] name The name of the test. This should be a string starting with
285
+ //! "TensorRT" and containing dot-separated strings containing
286
+ //! the characters [A-Za-z0-9_].
287
+ //! For example, "TensorRT.sample_googlenet"
288
+ //! \param[in] cmdline The command line used to reproduce the test
289
+ //
290
+ //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
291
+ //!
292
+ static TestAtom defineTest(const std::string& name, const std::string& cmdline)
293
+ {
294
+ return TestAtom(false, name, cmdline);
295
+ }
296
+
297
+ //!
298
+ //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
299
+ //! as input
300
+ //!
301
+ //! \param[in] name The name of the test
302
+ //! \param[in] argc The number of command-line arguments
303
+ //! \param[in] argv The array of command-line arguments (given as C strings)
304
+ //!
305
+ //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
306
+ static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
307
+ {
308
+ auto cmdline = genCmdlineString(argc, argv);
309
+ return defineTest(name, cmdline);
310
+ }
311
+
312
+ //!
313
+ //! \brief Report that a test has started.
314
+ //!
315
+ //! \pre reportTestStart() has not been called yet for the given testAtom
316
+ //!
317
+ //! \param[in] testAtom The handle to the test that has started
318
+ //!
319
+ static void reportTestStart(TestAtom& testAtom)
320
+ {
321
+ reportTestResult(testAtom, TestResult::kRUNNING);
322
+ assert(!testAtom.mStarted);
323
+ testAtom.mStarted = true;
324
+ }
325
+
326
+ //!
327
+ //! \brief Report that a test has ended.
328
+ //!
329
+ //! \pre reportTestStart() has been called for the given testAtom
330
+ //!
331
+ //! \param[in] testAtom The handle to the test that has ended
332
+ //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
333
+ //! TestResult::kFAILED, TestResult::kWAIVED
334
+ //!
335
+ static void reportTestEnd(const TestAtom& testAtom, TestResult result)
336
+ {
337
+ assert(result != TestResult::kRUNNING);
338
+ assert(testAtom.mStarted);
339
+ reportTestResult(testAtom, result);
340
+ }
341
+
342
+ static int reportPass(const TestAtom& testAtom)
343
+ {
344
+ reportTestEnd(testAtom, TestResult::kPASSED);
345
+ return EXIT_SUCCESS;
346
+ }
347
+
348
+ static int reportFail(const TestAtom& testAtom)
349
+ {
350
+ reportTestEnd(testAtom, TestResult::kFAILED);
351
+ return EXIT_FAILURE;
352
+ }
353
+
354
+ static int reportWaive(const TestAtom& testAtom)
355
+ {
356
+ reportTestEnd(testAtom, TestResult::kWAIVED);
357
+ return EXIT_SUCCESS;
358
+ }
359
+
360
+ static int reportTest(const TestAtom& testAtom, bool pass)
361
+ {
362
+ return pass ? reportPass(testAtom) : reportFail(testAtom);
363
+ }
364
+
365
+ Severity getReportableSeverity() const
366
+ {
367
+ return mReportableSeverity;
368
+ }
369
+
370
+ private:
371
+ //!
372
+ //! \brief returns an appropriate string for prefixing a log message with the given severity
373
+ //!
374
+ static const char* severityPrefix(Severity severity)
375
+ {
376
+ switch (severity)
377
+ {
378
+ case Severity::kINTERNAL_ERROR: return "[F] ";
379
+ case Severity::kERROR: return "[E] ";
380
+ case Severity::kWARNING: return "[W] ";
381
+ case Severity::kINFO: return "[I] ";
382
+ case Severity::kVERBOSE: return "[V] ";
383
+ default: assert(0); return "";
384
+ }
385
+ }
386
+
387
+ //!
388
+ //! \brief returns an appropriate string for prefixing a test result message with the given result
389
+ //!
390
+ static const char* testResultString(TestResult result)
391
+ {
392
+ switch (result)
393
+ {
394
+ case TestResult::kRUNNING: return "RUNNING";
395
+ case TestResult::kPASSED: return "PASSED";
396
+ case TestResult::kFAILED: return "FAILED";
397
+ case TestResult::kWAIVED: return "WAIVED";
398
+ default: assert(0); return "";
399
+ }
400
+ }
401
+
402
+ //!
403
+ //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
404
+ //!
405
+ static std::ostream& severityOstream(Severity severity)
406
+ {
407
+ return severity >= Severity::kINFO ? std::cout : std::cerr;
408
+ }
409
+
410
+ //!
411
+ //! \brief method that implements logging test results
412
+ //!
413
+ static void reportTestResult(const TestAtom& testAtom, TestResult result)
414
+ {
415
+ severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
416
+ << testAtom.mCmdline << std::endl;
417
+ }
418
+
419
+ //!
420
+ //! \brief generate a command line string from the given (argc, argv) values
421
+ //!
422
+ static std::string genCmdlineString(int argc, char const* const* argv)
423
+ {
424
+ std::stringstream ss;
425
+ for (int i = 0; i < argc; i++)
426
+ {
427
+ if (i > 0)
428
+ ss << " ";
429
+ ss << argv[i];
430
+ }
431
+ return ss.str();
432
+ }
433
+
434
+ Severity mReportableSeverity;
435
+ };
436
+
437
+ namespace
438
+ {
439
+
440
+ //!
441
+ //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
442
+ //!
443
+ //! Example usage:
444
+ //!
445
+ //! LOG_VERBOSE(logger) << "hello world" << std::endl;
446
+ //!
447
+ inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
448
+ {
449
+ return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
450
+ }
451
+
452
+ //!
453
+ //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
454
+ //!
455
+ //! Example usage:
456
+ //!
457
+ //! LOG_INFO(logger) << "hello world" << std::endl;
458
+ //!
459
+ inline LogStreamConsumer LOG_INFO(const Logger& logger)
460
+ {
461
+ return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
462
+ }
463
+
464
+ //!
465
+ //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
466
+ //!
467
+ //! Example usage:
468
+ //!
469
+ //! LOG_WARN(logger) << "hello world" << std::endl;
470
+ //!
471
+ inline LogStreamConsumer LOG_WARN(const Logger& logger)
472
+ {
473
+ return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
474
+ }
475
+
476
+ //!
477
+ //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
478
+ //!
479
+ //! Example usage:
480
+ //!
481
+ //! LOG_ERROR(logger) << "hello world" << std::endl;
482
+ //!
483
+ inline LogStreamConsumer LOG_ERROR(const Logger& logger)
484
+ {
485
+ return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
486
+ }
487
+
488
+ //!
489
+ //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
490
+ // ("fatal" severity)
491
+ //!
492
+ //! Example usage:
493
+ //!
494
+ //! LOG_FATAL(logger) << "hello world" << std::endl;
495
+ //!
496
+ inline LogStreamConsumer LOG_FATAL(const Logger& logger)
497
+ {
498
+ return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
499
+ }
500
+
501
+ } // anonymous namespace
502
+
503
+ #endif // TENSORRT_LOGGING_H
multimodal/YOLOX/demo/TensorRT/cpp/yolox.cpp ADDED
@@ -0,0 +1,530 @@
1
+ #include <fstream>
2
+ #include <iostream>
3
+ #include <sstream>
4
+ #include <numeric>
5
+ #include <chrono>
6
+ #include <vector>
7
+ #include <opencv2/opencv.hpp>
8
+ #include <dirent.h>
9
+ #include "NvInfer.h"
10
+ #include "cuda_runtime_api.h"
11
+ #include "logging.h"
12
+
13
+ #define CHECK(status) \
14
+ do\
15
+ {\
16
+ auto ret = (status);\
17
+ if (ret != 0)\
18
+ {\
19
+ std::cerr << "Cuda failure: " << ret << std::endl;\
20
+ abort();\
21
+ }\
22
+ } while (0)
23
+
24
+ #define DEVICE 0 // GPU id
25
+ #define NMS_THRESH 0.45
26
+ #define BBOX_CONF_THRESH 0.3
27
+
28
+ using namespace nvinfer1;
29
+
30
+ // stuff we know about the network and the input/output blobs
31
+ static const int INPUT_W = 640;
32
+ static const int INPUT_H = 640;
33
+ static const int NUM_CLASSES = 80;
34
+ const char* INPUT_BLOB_NAME = "input_0";
35
+ const char* OUTPUT_BLOB_NAME = "output_0";
36
+ static Logger gLogger;
37
+
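+ // Letterbox resize: scale the image to fit INPUT_W x INPUT_H while keeping its aspect
+ // ratio, then paste it into the top-left corner of a canvas filled with the value 114.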
38
+ cv::Mat static_resize(cv::Mat& img) {
39
+ float r = std::min(INPUT_W / (img.cols*1.0), INPUT_H / (img.rows*1.0));
40
+ // r = std::min(r, 1.0f);
41
+ int unpad_w = r * img.cols;
42
+ int unpad_h = r * img.rows;
43
+ cv::Mat re(unpad_h, unpad_w, CV_8UC3);
44
+ cv::resize(img, re, re.size());
45
+ cv::Mat out(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(114, 114, 114));
46
+ re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)));
47
+ return out;
48
+ }
49
+
50
+ struct Object
51
+ {
52
+ cv::Rect_<float> rect;
53
+ int label;
54
+ float prob;
55
+ };
56
+
57
+ struct GridAndStride
58
+ {
59
+ int grid0;
60
+ int grid1;
61
+ int stride;
62
+ };
63
+
64
+ static void generate_grids_and_stride(std::vector<int>& strides, std::vector<GridAndStride>& grid_strides)
65
+ {
66
+ for (auto stride : strides)
67
+ {
68
+ int num_grid_y = INPUT_H / stride;
69
+ int num_grid_x = INPUT_W / stride;
70
+ for (int g1 = 0; g1 < num_grid_y; g1++)
71
+ {
72
+ for (int g0 = 0; g0 < num_grid_x; g0++)
73
+ {
74
+ grid_strides.push_back((GridAndStride){g0, g1, stride});
75
+ }
76
+ }
77
+ }
78
+ }
79
+
80
+ static inline float intersection_area(const Object& a, const Object& b)
81
+ {
82
+ cv::Rect_<float> inter = a.rect & b.rect;
83
+ return inter.area();
84
+ }
85
+
86
+ static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
87
+ {
88
+ int i = left;
89
+ int j = right;
90
+ float p = faceobjects[(left + right) / 2].prob;
91
+
92
+ while (i <= j)
93
+ {
94
+ while (faceobjects[i].prob > p)
95
+ i++;
96
+
97
+ while (faceobjects[j].prob < p)
98
+ j--;
99
+
100
+ if (i <= j)
101
+ {
102
+ // swap
103
+ std::swap(faceobjects[i], faceobjects[j]);
104
+
105
+ i++;
106
+ j--;
107
+ }
108
+ }
109
+
110
+ #pragma omp parallel sections
111
+ {
112
+ #pragma omp section
113
+ {
114
+ if (left < j) qsort_descent_inplace(faceobjects, left, j);
115
+ }
116
+ #pragma omp section
117
+ {
118
+ if (i < right) qsort_descent_inplace(faceobjects, i, right);
119
+ }
120
+ }
121
+ }
122
+
123
+ static void qsort_descent_inplace(std::vector<Object>& objects)
124
+ {
125
+ if (objects.empty())
126
+ return;
127
+
128
+ qsort_descent_inplace(objects, 0, objects.size() - 1);
129
+ }
130
+
131
+ static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold)
132
+ {
133
+ picked.clear();
134
+
135
+ const int n = faceobjects.size();
136
+
137
+ std::vector<float> areas(n);
138
+ for (int i = 0; i < n; i++)
139
+ {
140
+ areas[i] = faceobjects[i].rect.area();
141
+ }
142
+
143
+ for (int i = 0; i < n; i++)
144
+ {
145
+ const Object& a = faceobjects[i];
146
+
147
+ int keep = 1;
148
+ for (int j = 0; j < (int)picked.size(); j++)
149
+ {
150
+ const Object& b = faceobjects[picked[j]];
151
+
152
+ // intersection over union
153
+ float inter_area = intersection_area(a, b);
154
+ float union_area = areas[i] + areas[picked[j]] - inter_area;
155
+ // float IoU = inter_area / union_area
156
+ if (inter_area / union_area > nms_threshold)
157
+ keep = 0;
158
+ }
159
+
160
+ if (keep)
161
+ picked.push_back(i);
162
+ }
163
+ }
164
+
165
+
166
+ static void generate_yolox_proposals(std::vector<GridAndStride> grid_strides, float* feat_blob, float prob_threshold, std::vector<Object>& objects)
167
+ {
168
+
169
+ const int num_anchors = grid_strides.size();
170
+
171
+ for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++)
172
+ {
173
+ const int grid0 = grid_strides[anchor_idx].grid0;
174
+ const int grid1 = grid_strides[anchor_idx].grid1;
175
+ const int stride = grid_strides[anchor_idx].stride;
176
+
177
+ const int basic_pos = anchor_idx * (NUM_CLASSES + 5);
178
+
179
+ // yolox/models/yolo_head.py decode logic
180
+ float x_center = (feat_blob[basic_pos+0] + grid0) * stride;
181
+ float y_center = (feat_blob[basic_pos+1] + grid1) * stride;
182
+ float w = exp(feat_blob[basic_pos+2]) * stride;
183
+ float h = exp(feat_blob[basic_pos+3]) * stride;
184
+ float x0 = x_center - w * 0.5f;
185
+ float y0 = y_center - h * 0.5f;
186
+
187
+ float box_objectness = feat_blob[basic_pos+4];
188
+ for (int class_idx = 0; class_idx < NUM_CLASSES; class_idx++)
189
+ {
190
+ float box_cls_score = feat_blob[basic_pos + 5 + class_idx];
191
+ float box_prob = box_objectness * box_cls_score;
192
+ if (box_prob > prob_threshold)
193
+ {
194
+ Object obj;
195
+ obj.rect.x = x0;
196
+ obj.rect.y = y0;
197
+ obj.rect.width = w;
198
+ obj.rect.height = h;
199
+ obj.label = class_idx;
200
+ obj.prob = box_prob;
201
+
202
+ objects.push_back(obj);
203
+ }
204
+
205
+ } // class loop
206
+
207
+ } // point anchor loop
208
+ }
209
+
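+ // Convert an HWC BGR cv::Mat (uint8) into a CHW float blob. Note that this demo feeds
+ // raw 0-255 pixel values to the network (no mean/std normalization is applied here).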
210
+ float* blobFromImage(cv::Mat& img){
211
+ float* blob = new float[img.total()*3];
212
+ int channels = 3;
213
+ int img_h = img.rows;
214
+ int img_w = img.cols;
215
+ for (size_t c = 0; c < channels; c++)
216
+ {
217
+ for (size_t h = 0; h < img_h; h++)
218
+ {
219
+ for (size_t w = 0; w < img_w; w++)
220
+ {
221
+ blob[c * img_w * img_h + h * img_w + w] =
222
+ (float)img.at<cv::Vec3b>(h, w)[c];
223
+ }
224
+ }
225
+ }
226
+ return blob;
227
+ }
228
+
229
+
230
+ static void decode_outputs(float* prob, std::vector<Object>& objects, float scale, const int img_w, const int img_h) {
231
+ std::vector<Object> proposals;
232
+ std::vector<int> strides = {8, 16, 32};
233
+ std::vector<GridAndStride> grid_strides;
234
+ generate_grids_and_stride(strides, grid_strides);
235
+ generate_yolox_proposals(grid_strides, prob, BBOX_CONF_THRESH, proposals);
236
+ std::cout << "num of boxes before nms: " << proposals.size() << std::endl;
237
+
238
+ qsort_descent_inplace(proposals);
239
+
240
+ std::vector<int> picked;
241
+ nms_sorted_bboxes(proposals, picked, NMS_THRESH);
242
+
243
+
244
+ int count = picked.size();
245
+
246
+ std::cout << "num of boxes: " << count << std::endl;
247
+
248
+ objects.resize(count);
249
+ for (int i = 0; i < count; i++)
250
+ {
251
+ objects[i] = proposals[picked[i]];
252
+
253
+ // adjust offset to original unpadded
254
+ float x0 = (objects[i].rect.x) / scale;
255
+ float y0 = (objects[i].rect.y) / scale;
256
+ float x1 = (objects[i].rect.x + objects[i].rect.width) / scale;
257
+ float y1 = (objects[i].rect.y + objects[i].rect.height) / scale;
258
+
259
+ // clip
260
+ x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
261
+ y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
262
+ x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
263
+ y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
264
+
265
+ objects[i].rect.x = x0;
266
+ objects[i].rect.y = y0;
267
+ objects[i].rect.width = x1 - x0;
268
+ objects[i].rect.height = y1 - y0;
269
+ }
270
+ }
271
+
272
+ const float color_list[80][3] =
273
+ {
274
+ {0.000, 0.447, 0.741},
275
+ {0.850, 0.325, 0.098},
276
+ {0.929, 0.694, 0.125},
277
+ {0.494, 0.184, 0.556},
278
+ {0.466, 0.674, 0.188},
279
+ {0.301, 0.745, 0.933},
280
+ {0.635, 0.078, 0.184},
281
+ {0.300, 0.300, 0.300},
282
+ {0.600, 0.600, 0.600},
283
+ {1.000, 0.000, 0.000},
284
+ {1.000, 0.500, 0.000},
285
+ {0.749, 0.749, 0.000},
286
+ {0.000, 1.000, 0.000},
287
+ {0.000, 0.000, 1.000},
288
+ {0.667, 0.000, 1.000},
289
+ {0.333, 0.333, 0.000},
290
+ {0.333, 0.667, 0.000},
291
+ {0.333, 1.000, 0.000},
292
+ {0.667, 0.333, 0.000},
293
+ {0.667, 0.667, 0.000},
294
+ {0.667, 1.000, 0.000},
295
+ {1.000, 0.333, 0.000},
296
+ {1.000, 0.667, 0.000},
297
+ {1.000, 1.000, 0.000},
298
+ {0.000, 0.333, 0.500},
299
+ {0.000, 0.667, 0.500},
300
+ {0.000, 1.000, 0.500},
301
+ {0.333, 0.000, 0.500},
302
+ {0.333, 0.333, 0.500},
303
+ {0.333, 0.667, 0.500},
304
+ {0.333, 1.000, 0.500},
305
+ {0.667, 0.000, 0.500},
306
+ {0.667, 0.333, 0.500},
307
+ {0.667, 0.667, 0.500},
308
+ {0.667, 1.000, 0.500},
309
+ {1.000, 0.000, 0.500},
310
+ {1.000, 0.333, 0.500},
311
+ {1.000, 0.667, 0.500},
312
+ {1.000, 1.000, 0.500},
313
+ {0.000, 0.333, 1.000},
314
+ {0.000, 0.667, 1.000},
315
+ {0.000, 1.000, 1.000},
316
+ {0.333, 0.000, 1.000},
317
+ {0.333, 0.333, 1.000},
318
+ {0.333, 0.667, 1.000},
319
+ {0.333, 1.000, 1.000},
320
+ {0.667, 0.000, 1.000},
321
+ {0.667, 0.333, 1.000},
322
+ {0.667, 0.667, 1.000},
323
+ {0.667, 1.000, 1.000},
324
+ {1.000, 0.000, 1.000},
325
+ {1.000, 0.333, 1.000},
326
+ {1.000, 0.667, 1.000},
327
+ {0.333, 0.000, 0.000},
328
+ {0.500, 0.000, 0.000},
329
+ {0.667, 0.000, 0.000},
330
+ {0.833, 0.000, 0.000},
331
+ {1.000, 0.000, 0.000},
332
+ {0.000, 0.167, 0.000},
333
+ {0.000, 0.333, 0.000},
334
+ {0.000, 0.500, 0.000},
335
+ {0.000, 0.667, 0.000},
336
+ {0.000, 0.833, 0.000},
337
+ {0.000, 1.000, 0.000},
338
+ {0.000, 0.000, 0.167},
339
+ {0.000, 0.000, 0.333},
340
+ {0.000, 0.000, 0.500},
341
+ {0.000, 0.000, 0.667},
342
+ {0.000, 0.000, 0.833},
343
+ {0.000, 0.000, 1.000},
344
+ {0.000, 0.000, 0.000},
345
+ {0.143, 0.143, 0.143},
346
+ {0.286, 0.286, 0.286},
347
+ {0.429, 0.429, 0.429},
348
+ {0.571, 0.571, 0.571},
349
+ {0.714, 0.714, 0.714},
350
+ {0.857, 0.857, 0.857},
351
+ {0.000, 0.447, 0.741},
352
+ {0.314, 0.717, 0.741},
353
+ {0.50, 0.5, 0}
354
+ };
355
+
356
+ static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects, std::string f)
357
+ {
358
+ static const char* class_names[] = {
359
+ "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
360
+ "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
361
+ "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
362
+ "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
363
+ "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
364
+ "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
365
+ "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
366
+ "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
367
+ "hair drier", "toothbrush"
368
+ };
369
+
370
+ cv::Mat image = bgr.clone();
371
+
372
+ for (size_t i = 0; i < objects.size(); i++)
373
+ {
374
+ const Object& obj = objects[i];
375
+
376
+ fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
377
+ obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
378
+
379
+ cv::Scalar color = cv::Scalar(color_list[obj.label][0], color_list[obj.label][1], color_list[obj.label][2]);
380
+ float c_mean = cv::mean(color)[0];
381
+ cv::Scalar txt_color;
382
+ if (c_mean > 0.5){
383
+ txt_color = cv::Scalar(0, 0, 0);
384
+ }else{
385
+ txt_color = cv::Scalar(255, 255, 255);
386
+ }
387
+
388
+ cv::rectangle(image, obj.rect, color * 255, 2);
389
+
390
+ char text[256];
391
+ sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
392
+
393
+ int baseLine = 0;
394
+ cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine);
395
+
396
+ cv::Scalar txt_bk_color = color * 0.7 * 255;
397
+
398
+ int x = obj.rect.x;
399
+ int y = obj.rect.y + 1;
400
+ //int y = obj.rect.y - label_size.height - baseLine;
401
+ if (y > image.rows)
402
+ y = image.rows;
403
+ //if (x + label_size.width > image.cols)
404
+ //x = image.cols - label_size.width;
405
+
406
+ cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
407
+ txt_bk_color, -1);
408
+
409
+ cv::putText(image, text, cv::Point(x, y + label_size.height),
410
+ cv::FONT_HERSHEY_SIMPLEX, 0.4, txt_color, 1);
411
+ }
412
+
413
+ cv::imwrite("det_res.jpg", image);
414
+ fprintf(stderr, "save vis file\n");
415
+ /* cv::imshow("image", image); */
416
+ /* cv::waitKey(0); */
417
+ }
418
+
419
+
420
+ void doInference(IExecutionContext& context, float* input, float* output, const int output_size, cv::Size input_shape) {
421
+ const ICudaEngine& engine = context.getEngine();
422
+
423
+ // Pointers to input and output device buffers to pass to engine.
424
+ // Engine requires exactly IEngine::getNbBindings() number of buffers.
425
+ assert(engine.getNbBindings() == 2);
426
+ void* buffers[2];
427
+
428
+ // In order to bind the buffers, we need to know the names of the input and output tensors.
429
+ // Note that indices are guaranteed to be less than IEngine::getNbBindings()
430
+ const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
431
+
432
+ assert(engine.getBindingDataType(inputIndex) == nvinfer1::DataType::kFLOAT);
433
+ const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
434
+ assert(engine.getBindingDataType(outputIndex) == nvinfer1::DataType::kFLOAT);
435
+ int mBatchSize = engine.getMaxBatchSize();
436
+
437
+ // Create GPU buffers on device
438
+ CHECK(cudaMalloc(&buffers[inputIndex], 3 * input_shape.height * input_shape.width * sizeof(float)));
439
+ CHECK(cudaMalloc(&buffers[outputIndex], output_size*sizeof(float)));
440
+
441
+ // Create stream
442
+ cudaStream_t stream;
443
+ CHECK(cudaStreamCreate(&stream));
444
+
445
+ // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
446
+ CHECK(cudaMemcpyAsync(buffers[inputIndex], input, 3 * input_shape.height * input_shape.width * sizeof(float), cudaMemcpyHostToDevice, stream));
447
+ context.enqueue(1, buffers, stream, nullptr);
448
+ CHECK(cudaMemcpyAsync(output, buffers[outputIndex], output_size * sizeof(float), cudaMemcpyDeviceToHost, stream));
449
+ cudaStreamSynchronize(stream);
450
+
451
+ // Release stream and buffers
452
+ cudaStreamDestroy(stream);
453
+ CHECK(cudaFree(buffers[inputIndex]));
454
+ CHECK(cudaFree(buffers[outputIndex]));
455
+ }
456
+
457
+ int main(int argc, char** argv) {
458
+ cudaSetDevice(DEVICE);
459
+ // create a model using the API directly and serialize it to a stream
460
+ char *trtModelStream{nullptr};
461
+ size_t size{0};
462
+
463
+ if (argc == 4 && std::string(argv[2]) == "-i") {
464
+ const std::string engine_file_path {argv[1]};
465
+ std::ifstream file(engine_file_path, std::ios::binary);
466
+ if (file.good()) {
467
+ file.seekg(0, file.end);
468
+ size = file.tellg();
469
+ file.seekg(0, file.beg);
470
+ trtModelStream = new char[size];
471
+ assert(trtModelStream);
472
+ file.read(trtModelStream, size);
473
+ file.close();
474
+ }
475
+ } else {
476
+ std::cerr << "arguments not right!" << std::endl;
477
+ std::cerr << "run 'python3 yolox/deploy/trt.py -n yolox-{tiny, s, m, l, x}' to serialize model first!" << std::endl;
478
+ std::cerr << "Then use the following command:" << std::endl;
479
+ std::cerr << "./yolox ../model_trt.engine -i ../../../assets/dog.jpg // deserialize file and run inference" << std::endl;
480
+ return -1;
481
+ }
482
+ const std::string input_image_path {argv[3]};
483
+
484
+ //std::vector<std::string> file_names;
485
+ //if (read_files_in_dir(argv[2], file_names) < 0) {
486
+ //std::cout << "read_files_in_dir failed." << std::endl;
487
+ //return -1;
488
+ //}
489
+
490
+ IRuntime* runtime = createInferRuntime(gLogger);
491
+ assert(runtime != nullptr);
492
+ ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
493
+ assert(engine != nullptr);
494
+ IExecutionContext* context = engine->createExecutionContext();
495
+ assert(context != nullptr);
496
+ delete[] trtModelStream;
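+ // output_size = product of the output binding's dimensions
+ // (typically num_anchors * (NUM_CLASSES + 5) for a YOLOX engine)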
497
+ auto out_dims = engine->getBindingDimensions(1);
498
+ auto output_size = 1;
499
+ for(int j=0;j<out_dims.nbDims;j++) {
500
+ output_size *= out_dims.d[j];
501
+ }
502
+ static float* prob = new float[output_size];
503
+
504
+ cv::Mat img = cv::imread(input_image_path);
505
+ int img_w = img.cols;
506
+ int img_h = img.rows;
507
+ cv::Mat pr_img = static_resize(img);
508
+ std::cout << "blob image" << std::endl;
509
+
510
+ float* blob;
511
+ blob = blobFromImage(pr_img);
512
+ float scale = std::min(INPUT_W / (img.cols*1.0), INPUT_H / (img.rows*1.0));
513
+
514
+ // run inference
515
+ auto start = std::chrono::system_clock::now();
516
+ doInference(*context, blob, prob, output_size, pr_img.size());
517
+ auto end = std::chrono::system_clock::now();
518
+ std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
519
+
520
+ std::vector<Object> objects;
521
+ decode_outputs(prob, objects, scale, img_w, img_h);
522
+ draw_objects(img, objects, input_image_path);
523
+ // delete the pointer to the float
524
+ delete blob;
525
+ // destroy the engine
526
+ context->destroy();
527
+ engine->destroy();
528
+ runtime->destroy();
529
+ return 0;
530
+ }
multimodal/YOLOX/demo/TensorRT/python/README.md ADDED
@@ -0,0 +1,46 @@
1
+ # YOLOX-TensorRT in Python
2
+
3
+ This tutorial includes a Python demo for TensorRT.
4
+
5
+ ## Install TensorRT Toolkit
6
+
7
+ Please follow the [TensorRT Installation Guide](https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html) and [torch2trt gitrepo](https://github.com/NVIDIA-AI-IOT/torch2trt) to install TensorRT and torch2trt.
8
+
9
+ ## Convert model
10
+
11
+ YOLOX models can be easily converted to TensorRT models using torch2trt.
12
+
13
+ If you want to convert our model, use the flag -n to specify a model name:
14
+ ```shell
15
+ python tools/trt.py -n <YOLOX_MODEL_NAME> -c <YOLOX_CHECKPOINT>
16
+ ```
17
+ For example:
18
+ ```shell
19
+ python tools/trt.py -n yolox-s -c your_ckpt.pth
20
+ ```
21
+ <YOLOX_MODEL_NAME> can be: yolox-nano, yolox-tiny, yolox-s, yolox-m, yolox-l, yolox-x.
22
+
23
+ If you want to convert your customized model, use the flag -f to specify your exp file:
24
+ ```shell
25
+ python tools/trt.py -f <YOLOX_EXP_FILE> -c <YOLOX_CHECKPOINT>
26
+ ```
27
+ For example:
28
+ ```shell
29
+ python tools/trt.py -f /path/to/your/yolox/exps/yolox_s.py -c your_ckpt.pth
30
+ ```
31
+ *yolox_s.py* can be any exp file you have modified.
32
+
33
+ The converted model and the serialized engine file (for the C++ demo) will be saved in your experiment output directory.
34
+
35
+ ## Demo
36
+
37
+ The TensorRT Python demo is merged into our PyTorch demo file, so you can run the PyTorch demo command with ```--trt```.
38
+
39
+ ```shell
40
+ python tools/demo.py image -n yolox-s --trt --save_result
41
+ ```
42
+ or
43
+ ```shell
44
+ python tools/demo.py image -f exps/default/yolox_s.py --trt --save_result
45
+ ```
46
+
multimodal/YOLOX/demo/ncnn/README.md ADDED
@@ -0,0 +1,8 @@
1
+ # YOLOX-ncnn
2
+
3
+ Build files for YOLOX object detection based on [ncnn](https://github.com/Tencent/ncnn).
4
+ YOLOX is now included in ncnn itself, so you can also try building directly from ncnn, which is the better option.
5
+
6
+ ## Acknowledgement
7
+
8
+ * [ncnn](https://github.com/Tencent/ncnn)
multimodal/YOLOX/demo/ncnn/android/README.md ADDED
@@ -0,0 +1,27 @@
1
+ # YOLOX-Android-ncnn
2
+
3
+ Android app for YOLOX object detection based on [ncnn](https://github.com/Tencent/ncnn)
4
+
5
+
6
+ ## Tutorial
7
+
8
+ ### Step1
9
+
10
+ Download ncnn-android-vulkan.zip from [releases of ncnn](https://github.com/Tencent/ncnn/releases). This repo uses
11
+ the [20210525 release](https://github.com/Tencent/ncnn/releases/download/20210525/ncnn-20210525-android-vulkan.zip) for building.
12
+
13
+ ### Step2
14
+
15
+ After downloading, please extract your zip file. Then, there are two ways to finish this step:
16
+ * put your extracted directory into **app/src/main/jni**
17
+ * change the **ncnn_DIR** path in **app/src/main/jni/CMakeLists.txt** to your extracted directory
18
+
19
+ ### Step3
20
+ Download example param and bin file from [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/ESXBH_GSSmFMszWJ6YG2VkQB5cWDfqVWXgk0D996jH0rpQ?e=qzEqUh) or [github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_s_ncnn.tar.gz). Unzip the file to **app/src/main/assets**.
21
+
22
+ ### Step4
23
+ Open this project with Android Studio, build it and enjoy!
24
+
25
+ ## Reference
26
+
27
+ * [ncnn-android-yolov5](https://github.com/nihui/ncnn-android-yolov5)
multimodal/YOLOX/demo/ncnn/android/app/build.gradle ADDED
@@ -0,0 +1,24 @@
1
+ apply plugin: 'com.android.application'
2
+
3
+ android {
4
+ compileSdkVersion 24
5
+ buildToolsVersion "29.0.2"
6
+
7
+ defaultConfig {
8
+ applicationId "com.megvii.yoloXncnn"
9
+ archivesBaseName = "$applicationId"
10
+
11
+ ndk {
12
+ moduleName "ncnn"
13
+ abiFilters "armeabi-v7a", "arm64-v8a"
14
+ }
15
+ minSdkVersion 24
16
+ }
17
+
18
+ externalNativeBuild {
19
+ cmake {
20
+ version "3.10.2"
21
+ path file('src/main/jni/CMakeLists.txt')
22
+ }
23
+ }
24
+ }
multimodal/YOLOX/demo/ncnn/android/app/src/main/AndroidManifest.xml ADDED
@@ -0,0 +1,15 @@
1
+ <?xml version="1.0" encoding="utf-8"?>
2
+ <manifest xmlns:android="http://schemas.android.com/apk/res/android"
3
+ package="com.megvii.yoloXncnn"
4
+ android:versionCode="1"
5
+ android:versionName="1.1">
6
+ <application android:label="@string/app_name" >
7
+ <activity android:name="MainActivity"
8
+ android:label="@string/app_name">
9
+ <intent-filter>
10
+ <action android:name="android.intent.action.MAIN" />
11
+ <category android:name="android.intent.category.LAUNCHER" />
12
+ </intent-filter>
13
+ </activity>
14
+ </application>
15
+ </manifest>
multimodal/YOLOX/demo/ncnn/android/app/src/main/assets/yolox.param ADDED
@@ -0,0 +1,222 @@
1
+ 7767517
2
+ 220 250
3
+ Input images 0 1 images
4
+ YoloV5Focus focus 1 1 images 503
5
+ Convolution Conv_41 1 1 503 877 0=32 1=3 4=1 5=1 6=3456
6
+ Swish Mul_43 1 1 877 507
7
+ Convolution Conv_44 1 1 507 880 0=64 1=3 3=2 4=1 5=1 6=18432
8
+ Swish Mul_46 1 1 880 511
9
+ Split splitncnn_0 1 2 511 511_splitncnn_0 511_splitncnn_1
10
+ Convolution Conv_47 1 1 511_splitncnn_1 883 0=32 1=1 5=1 6=2048
11
+ Swish Mul_49 1 1 883 515
12
+ Split splitncnn_1 1 2 515 515_splitncnn_0 515_splitncnn_1
13
+ Convolution Conv_50 1 1 511_splitncnn_0 886 0=32 1=1 5=1 6=2048
14
+ Swish Mul_52 1 1 886 519
15
+ Convolution Conv_53 1 1 515_splitncnn_1 889 0=32 1=1 5=1 6=1024
16
+ Swish Mul_55 1 1 889 523
17
+ Convolution Conv_56 1 1 523 892 0=32 1=3 4=1 5=1 6=9216
18
+ Swish Mul_58 1 1 892 527
19
+ BinaryOp Add_59 2 1 527 515_splitncnn_0 528
20
+ Concat Concat_60 2 1 528 519 529
21
+ Convolution Conv_61 1 1 529 895 0=64 1=1 5=1 6=4096
22
+ Swish Mul_63 1 1 895 533
23
+ Convolution Conv_64 1 1 533 898 0=128 1=3 3=2 4=1 5=1 6=73728
24
+ Swish Mul_66 1 1 898 537
25
+ Split splitncnn_2 1 2 537 537_splitncnn_0 537_splitncnn_1
26
+ Convolution Conv_67 1 1 537_splitncnn_1 901 0=64 1=1 5=1 6=8192
27
+ Swish Mul_69 1 1 901 541
28
+ Split splitncnn_3 1 2 541 541_splitncnn_0 541_splitncnn_1
29
+ Convolution Conv_70 1 1 537_splitncnn_0 904 0=64 1=1 5=1 6=8192
30
+ Swish Mul_72 1 1 904 545
31
+ Convolution Conv_73 1 1 541_splitncnn_1 907 0=64 1=1 5=1 6=4096
32
+ Swish Mul_75 1 1 907 549
33
+ Convolution Conv_76 1 1 549 910 0=64 1=3 4=1 5=1 6=36864
34
+ Swish Mul_78 1 1 910 553
35
+ BinaryOp Add_79 2 1 553 541_splitncnn_0 554
36
+ Split splitncnn_4 1 2 554 554_splitncnn_0 554_splitncnn_1
37
+ Convolution Conv_80 1 1 554_splitncnn_1 913 0=64 1=1 5=1 6=4096
38
+ Swish Mul_82 1 1 913 558
39
+ Convolution Conv_83 1 1 558 916 0=64 1=3 4=1 5=1 6=36864
40
+ Swish Mul_85 1 1 916 562
41
+ BinaryOp Add_86 2 1 562 554_splitncnn_0 563
42
+ Split splitncnn_5 1 2 563 563_splitncnn_0 563_splitncnn_1
43
+ Convolution Conv_87 1 1 563_splitncnn_1 919 0=64 1=1 5=1 6=4096
44
+ Swish Mul_89 1 1 919 567
45
+ Convolution Conv_90 1 1 567 922 0=64 1=3 4=1 5=1 6=36864
46
+ Swish Mul_92 1 1 922 571
47
+ BinaryOp Add_93 2 1 571 563_splitncnn_0 572
48
+ Concat Concat_94 2 1 572 545 573
49
+ Convolution Conv_95 1 1 573 925 0=128 1=1 5=1 6=16384
50
+ Swish Mul_97 1 1 925 577
51
+ Split splitncnn_6 1 2 577 577_splitncnn_0 577_splitncnn_1
52
+ Convolution Conv_98 1 1 577_splitncnn_1 928 0=256 1=3 3=2 4=1 5=1 6=294912
53
+ Swish Mul_100 1 1 928 581
54
+ Split splitncnn_7 1 2 581 581_splitncnn_0 581_splitncnn_1
55
+ Convolution Conv_101 1 1 581_splitncnn_1 931 0=128 1=1 5=1 6=32768
56
+ Swish Mul_103 1 1 931 585
57
+ Split splitncnn_8 1 2 585 585_splitncnn_0 585_splitncnn_1
58
+ Convolution Conv_104 1 1 581_splitncnn_0 934 0=128 1=1 5=1 6=32768
59
+ Swish Mul_106 1 1 934 589
60
+ Convolution Conv_107 1 1 585_splitncnn_1 937 0=128 1=1 5=1 6=16384
61
+ Swish Mul_109 1 1 937 593
62
+ Convolution Conv_110 1 1 593 940 0=128 1=3 4=1 5=1 6=147456
63
+ Swish Mul_112 1 1 940 597
64
+ BinaryOp Add_113 2 1 597 585_splitncnn_0 598
65
+ Split splitncnn_9 1 2 598 598_splitncnn_0 598_splitncnn_1
66
+ Convolution Conv_114 1 1 598_splitncnn_1 943 0=128 1=1 5=1 6=16384
67
+ Swish Mul_116 1 1 943 602
68
+ Convolution Conv_117 1 1 602 946 0=128 1=3 4=1 5=1 6=147456
69
+ Swish Mul_119 1 1 946 606
70
+ BinaryOp Add_120 2 1 606 598_splitncnn_0 607
71
+ Split splitncnn_10 1 2 607 607_splitncnn_0 607_splitncnn_1
72
+ Convolution Conv_121 1 1 607_splitncnn_1 949 0=128 1=1 5=1 6=16384
73
+ Swish Mul_123 1 1 949 611
74
+ Convolution Conv_124 1 1 611 952 0=128 1=3 4=1 5=1 6=147456
75
+ Swish Mul_126 1 1 952 615
76
+ BinaryOp Add_127 2 1 615 607_splitncnn_0 616
77
+ Concat Concat_128 2 1 616 589 617
78
+ Convolution Conv_129 1 1 617 955 0=256 1=1 5=1 6=65536
79
+ Swish Mul_131 1 1 955 621
80
+ Split splitncnn_11 1 2 621 621_splitncnn_0 621_splitncnn_1
81
+ Convolution Conv_132 1 1 621_splitncnn_1 958 0=512 1=3 3=2 4=1 5=1 6=1179648
82
+ Swish Mul_134 1 1 958 625
83
+ Convolution Conv_135 1 1 625 961 0=256 1=1 5=1 6=131072
84
+ Swish Mul_137 1 1 961 629
85
+ Split splitncnn_12 1 4 629 629_splitncnn_0 629_splitncnn_1 629_splitncnn_2 629_splitncnn_3
86
+ Pooling MaxPool_138 1 1 629_splitncnn_3 630 1=5 3=2 5=1
87
+ Pooling MaxPool_139 1 1 629_splitncnn_2 631 1=9 3=4 5=1
88
+ Pooling MaxPool_140 1 1 629_splitncnn_1 632 1=13 3=6 5=1
89
+ Concat Concat_141 4 1 629_splitncnn_0 630 631 632 633
90
+ Convolution Conv_142 1 1 633 964 0=512 1=1 5=1 6=524288
91
+ Swish Mul_144 1 1 964 637
92
+ Split splitncnn_13 1 2 637 637_splitncnn_0 637_splitncnn_1
93
+ Convolution Conv_145 1 1 637_splitncnn_1 967 0=256 1=1 5=1 6=131072
94
+ Swish Mul_147 1 1 967 641
95
+ Convolution Conv_148 1 1 637_splitncnn_0 970 0=256 1=1 5=1 6=131072
96
+ Swish Mul_150 1 1 970 645
97
+ Convolution Conv_151 1 1 641 973 0=256 1=1 5=1 6=65536
98
+ Swish Mul_153 1 1 973 649
99
+ Convolution Conv_154 1 1 649 976 0=256 1=3 4=1 5=1 6=589824
100
+ Swish Mul_156 1 1 976 653
101
+ Concat Concat_157 2 1 653 645 654
102
+ Convolution Conv_158 1 1 654 979 0=512 1=1 5=1 6=262144
103
+ Swish Mul_160 1 1 979 658
104
+ Convolution Conv_161 1 1 658 982 0=256 1=1 5=1 6=131072
105
+ Swish Mul_163 1 1 982 662
106
+ Split splitncnn_14 1 2 662 662_splitncnn_0 662_splitncnn_1
107
+ Interp Resize_165 1 1 662_splitncnn_1 667 0=1 1=2.000000e+00 2=2.000000e+00
108
+ Concat Concat_166 2 1 667 621_splitncnn_0 668
109
+ Split splitncnn_15 1 2 668 668_splitncnn_0 668_splitncnn_1
110
+ Convolution Conv_167 1 1 668_splitncnn_1 985 0=128 1=1 5=1 6=65536
111
+ Swish Mul_169 1 1 985 672
112
+ Convolution Conv_170 1 1 668_splitncnn_0 988 0=128 1=1 5=1 6=65536
113
+ Swish Mul_172 1 1 988 676
114
+ Convolution Conv_173 1 1 672 991 0=128 1=1 5=1 6=16384
115
+ Swish Mul_175 1 1 991 680
116
+ Convolution Conv_176 1 1 680 994 0=128 1=3 4=1 5=1 6=147456
117
+ Swish Mul_178 1 1 994 684
118
+ Concat Concat_179 2 1 684 676 685
119
+ Convolution Conv_180 1 1 685 997 0=256 1=1 5=1 6=65536
120
+ Swish Mul_182 1 1 997 689
121
+ Convolution Conv_183 1 1 689 1000 0=128 1=1 5=1 6=32768
122
+ Swish Mul_185 1 1 1000 693
123
+ Split splitncnn_16 1 2 693 693_splitncnn_0 693_splitncnn_1
124
+ Interp Resize_187 1 1 693_splitncnn_1 698 0=1 1=2.000000e+00 2=2.000000e+00
125
+ Concat Concat_188 2 1 698 577_splitncnn_0 699
126
+ Split splitncnn_17 1 2 699 699_splitncnn_0 699_splitncnn_1
127
+ Convolution Conv_189 1 1 699_splitncnn_1 1003 0=64 1=1 5=1 6=16384
128
+ Swish Mul_191 1 1 1003 703
129
+ Convolution Conv_192 1 1 699_splitncnn_0 1006 0=64 1=1 5=1 6=16384
130
+ Swish Mul_194 1 1 1006 707
131
+ Convolution Conv_195 1 1 703 1009 0=64 1=1 5=1 6=4096
132
+ Swish Mul_197 1 1 1009 711
133
+ Convolution Conv_198 1 1 711 1012 0=64 1=3 4=1 5=1 6=36864
134
+ Swish Mul_200 1 1 1012 715
135
+ Concat Concat_201 2 1 715 707 716
136
+ Convolution Conv_202 1 1 716 1015 0=128 1=1 5=1 6=16384
137
+ Swish Mul_204 1 1 1015 720
138
+ Split splitncnn_18 1 2 720 720_splitncnn_0 720_splitncnn_1
139
+ Convolution Conv_205 1 1 720_splitncnn_1 1018 0=128 1=3 3=2 4=1 5=1 6=147456
140
+ Swish Mul_207 1 1 1018 724
141
+ Concat Concat_208 2 1 724 693_splitncnn_0 725
142
+ Split splitncnn_19 1 2 725 725_splitncnn_0 725_splitncnn_1
143
+ Convolution Conv_209 1 1 725_splitncnn_1 1021 0=128 1=1 5=1 6=32768
144
+ Swish Mul_211 1 1 1021 729
145
+ Convolution Conv_212 1 1 725_splitncnn_0 1024 0=128 1=1 5=1 6=32768
146
+ Swish Mul_214 1 1 1024 733
147
+ Convolution Conv_215 1 1 729 1027 0=128 1=1 5=1 6=16384
148
+ Swish Mul_217 1 1 1027 737
149
+ Convolution Conv_218 1 1 737 1030 0=128 1=3 4=1 5=1 6=147456
150
+ Swish Mul_220 1 1 1030 741
151
+ Concat Concat_221 2 1 741 733 742
152
+ Convolution Conv_222 1 1 742 1033 0=256 1=1 5=1 6=65536
153
+ Swish Mul_224 1 1 1033 746
154
+ Split splitncnn_20 1 2 746 746_splitncnn_0 746_splitncnn_1
155
+ Convolution Conv_225 1 1 746_splitncnn_1 1036 0=256 1=3 3=2 4=1 5=1 6=589824
156
+ Swish Mul_227 1 1 1036 750
157
+ Concat Concat_228 2 1 750 662_splitncnn_0 751
158
+ Split splitncnn_21 1 2 751 751_splitncnn_0 751_splitncnn_1
159
+ Convolution Conv_229 1 1 751_splitncnn_1 1039 0=256 1=1 5=1 6=131072
160
+ Swish Mul_231 1 1 1039 755
161
+ Convolution Conv_232 1 1 751_splitncnn_0 1042 0=256 1=1 5=1 6=131072
162
+ Swish Mul_234 1 1 1042 759
163
+ Convolution Conv_235 1 1 755 1045 0=256 1=1 5=1 6=65536
164
+ Swish Mul_237 1 1 1045 763
165
+ Convolution Conv_238 1 1 763 1048 0=256 1=3 4=1 5=1 6=589824
166
+ Swish Mul_240 1 1 1048 767
167
+ Concat Concat_241 2 1 767 759 768
168
+ Convolution Conv_242 1 1 768 1051 0=512 1=1 5=1 6=262144
169
+ Swish Mul_244 1 1 1051 772
170
+ Convolution Conv_245 1 1 720_splitncnn_0 1054 0=128 1=1 5=1 6=16384
171
+ Swish Mul_247 1 1 1054 776
172
+ Split splitncnn_22 1 2 776 776_splitncnn_0 776_splitncnn_1
173
+ Convolution Conv_248 1 1 776_splitncnn_1 1057 0=128 1=3 4=1 5=1 6=147456
174
+ Swish Mul_250 1 1 1057 780
175
+ Convolution Conv_251 1 1 780 1060 0=128 1=3 4=1 5=1 6=147456
176
+ Swish Mul_253 1 1 1060 784
177
+ Convolution Conv_254 1 1 784 797 0=80 1=1 5=1 6=10240 9=4
178
+ Convolution Conv_255 1 1 776_splitncnn_0 1063 0=128 1=3 4=1 5=1 6=147456
179
+ Swish Mul_257 1 1 1063 789
180
+ Convolution Conv_258 1 1 789 1066 0=128 1=3 4=1 5=1 6=147456
181
+ Swish Mul_260 1 1 1066 793
182
+ Split splitncnn_23 1 2 793 793_splitncnn_0 793_splitncnn_1
183
+ Convolution Conv_261 1 1 793_splitncnn_1 794 0=4 1=1 5=1 6=512
184
+ Convolution Conv_262 1 1 793_splitncnn_0 796 0=1 1=1 5=1 6=128 9=4
185
+ Concat Concat_265 3 1 794 796 797 798
186
+ Convolution Conv_266 1 1 746_splitncnn_0 1069 0=128 1=1 5=1 6=32768
187
+ Swish Mul_268 1 1 1069 802
188
+ Split splitncnn_24 1 2 802 802_splitncnn_0 802_splitncnn_1
189
+ Convolution Conv_269 1 1 802_splitncnn_1 1072 0=128 1=3 4=1 5=1 6=147456
190
+ Swish Mul_271 1 1 1072 806
191
+ Convolution Conv_272 1 1 806 1075 0=128 1=3 4=1 5=1 6=147456
192
+ Swish Mul_274 1 1 1075 810
193
+ Convolution Conv_275 1 1 810 823 0=80 1=1 5=1 6=10240 9=4
194
+ Convolution Conv_276 1 1 802_splitncnn_0 1078 0=128 1=3 4=1 5=1 6=147456
195
+ Swish Mul_278 1 1 1078 815
196
+ Convolution Conv_279 1 1 815 1081 0=128 1=3 4=1 5=1 6=147456
197
+ Swish Mul_281 1 1 1081 819
198
+ Split splitncnn_25 1 2 819 819_splitncnn_0 819_splitncnn_1
199
+ Convolution Conv_282 1 1 819_splitncnn_1 820 0=4 1=1 5=1 6=512
200
+ Convolution Conv_283 1 1 819_splitncnn_0 822 0=1 1=1 5=1 6=128 9=4
201
+ Concat Concat_286 3 1 820 822 823 824
202
+ Convolution Conv_287 1 1 772 1084 0=128 1=1 5=1 6=65536
203
+ Swish Mul_289 1 1 1084 828
204
+ Split splitncnn_26 1 2 828 828_splitncnn_0 828_splitncnn_1
205
+ Convolution Conv_290 1 1 828_splitncnn_1 1087 0=128 1=3 4=1 5=1 6=147456
206
+ Swish Mul_292 1 1 1087 832
207
+ Convolution Conv_293 1 1 832 1090 0=128 1=3 4=1 5=1 6=147456
208
+ Swish Mul_295 1 1 1090 836
209
+ Convolution Conv_296 1 1 836 849 0=80 1=1 5=1 6=10240 9=4
210
+ Convolution Conv_297 1 1 828_splitncnn_0 1093 0=128 1=3 4=1 5=1 6=147456
211
+ Swish Mul_299 1 1 1093 841
212
+ Convolution Conv_300 1 1 841 1096 0=128 1=3 4=1 5=1 6=147456
213
+ Swish Mul_302 1 1 1096 845
214
+ Split splitncnn_27 1 2 845 845_splitncnn_0 845_splitncnn_1
215
+ Convolution Conv_303 1 1 845_splitncnn_1 846 0=4 1=1 5=1 6=512
216
+ Convolution Conv_304 1 1 845_splitncnn_0 848 0=1 1=1 5=1 6=128 9=4
217
+ Concat Concat_307 3 1 846 848 849 850
218
+ Reshape Reshape_315 1 1 798 858 0=-1 1=85
219
+ Reshape Reshape_323 1 1 824 866 0=-1 1=85
220
+ Reshape Reshape_331 1 1 850 874 0=-1 1=85
221
+ Concat Concat_332 3 1 858 866 874 875 0=1
222
+ Permute Transpose_333 1 1 875 output 0=1
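The tail of yolox.param above is the detection head: at each of the three scales the 4-channel box branch, the 1-channel objectness branch and the 80-channel class branch are concatenated to 85 values per location, reshaped, merged across scales and transposed into the single blob named "output". Below is a minimal desktop-style sketch (not part of this commit) of how such a param/bin pair is typically loaded with ncnn; the file paths and the placeholder input are assumptions, and the YoloV5Focus layer is condensed from yoloXncnn_jni.cpp further down, since the exported graph needs that custom layer registered before load_param:

    #include "layer.h"
    #include "net.h"

    class YoloV5Focus : public ncnn::Layer
    {
    public:
        YoloV5Focus() { one_blob_only = true; }

        virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const
        {
            // space-to-depth: C x H x W -> 4C x H/2 x W/2, one output channel per 2x2 offset
            int w = bottom_blob.w, h = bottom_blob.h, channels = bottom_blob.c;
            int outw = w / 2, outh = h / 2, outc = channels * 4;
            top_blob.create(outw, outh, outc, 4u, 1, opt.blob_allocator);
            if (top_blob.empty())
                return -100;
            for (int p = 0; p < outc; p++)
            {
                const float* ptr = bottom_blob.channel(p % channels).row((p / channels) % 2) + ((p / channels) / 2);
                float* outptr = top_blob.channel(p);
                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        *outptr++ = *ptr;
                        ptr += 2;
                    }
                    ptr += w;
                }
            }
            return 0;
        }
    };

    DEFINE_LAYER_CREATOR(YoloV5Focus)

    int main()
    {
        ncnn::Net yolox;
        yolox.register_custom_layer("YoloV5Focus", YoloV5Focus_layer_creator);
        if (yolox.load_param("yolox.param") != 0 || yolox.load_model("yolox.bin") != 0)
            return -1;

        ncnn::Mat in(640, 640, 3);          // letterboxed 640x640 input, placeholder pixel data
        in.fill(114.f);
        ncnn::Extractor ex = yolox.create_extractor();
        ex.input("images", in);             // input blob name used by the demo code

        ncnn::Mat out;
        ex.extract("output", out);          // one row per grid cell: 4 box + 1 obj + 80 class values
        return 0;
    }

The actual preprocessing and decoding of "output" is shown in yoloXncnn_jni.cpp further down.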
multimodal/YOLOX/demo/ncnn/android/app/src/main/java/com/megvii/yoloXncnn/MainActivity.java ADDED
@@ -0,0 +1,247 @@
1
+ // Some code in this file is based on:
2
+ // https://github.com/nihui/ncnn-android-yolov5/blob/master/app/src/main/java/com/tencent/yolov5ncnn/MainActivity.java
3
+ // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
4
+ // Copyright (C) Megvii, Inc. and its affiliates. All rights reserved.
5
+
6
+ package com.megvii.yoloXncnn;
7
+
8
+ import android.app.Activity;
9
+ import android.content.Intent;
10
+ import android.graphics.Bitmap;
11
+ import android.graphics.BitmapFactory;
12
+ import android.graphics.Canvas;
13
+ import android.graphics.Color;
14
+ import android.graphics.Paint;
15
+ import android.media.ExifInterface;
16
+ import android.graphics.Matrix;
17
+ import android.net.Uri;
18
+ import android.os.Bundle;
19
+ import android.util.Log;
20
+ import android.view.View;
21
+ import android.widget.Button;
22
+ import android.widget.ImageView;
23
+
24
+ import java.io.FileNotFoundException;
25
+ import java.io.InputStream;
26
+ import java.io.IOException;
27
+
28
+ public class MainActivity extends Activity
29
+ {
30
+ private static final int SELECT_IMAGE = 1;
31
+
32
+ private ImageView imageView;
33
+ private Bitmap bitmap = null;
34
+ private Bitmap yourSelectedImage = null;
35
+
36
+ private YOLOXncnn yoloX = new YOLOXncnn();
37
+
38
+ /** Called when the activity is first created. */
39
+ @Override
40
+ public void onCreate(Bundle savedInstanceState)
41
+ {
42
+ super.onCreate(savedInstanceState);
43
+ setContentView(R.layout.main);
44
+
45
+ boolean ret_init = yoloX.Init(getAssets());
46
+ if (!ret_init)
47
+ {
48
+ Log.e("MainActivity", "yoloXncnn Init failed");
49
+ }
50
+
51
+ imageView = (ImageView) findViewById(R.id.imageView);
52
+
53
+ Button buttonImage = (Button) findViewById(R.id.buttonImage);
54
+ buttonImage.setOnClickListener(new View.OnClickListener() {
55
+ @Override
56
+ public void onClick(View arg0) {
57
+ Intent i = new Intent(Intent.ACTION_PICK);
58
+ i.setType("image/*");
59
+ startActivityForResult(i, SELECT_IMAGE);
60
+ }
61
+ });
62
+
63
+ Button buttonDetect = (Button) findViewById(R.id.buttonDetect);
64
+ buttonDetect.setOnClickListener(new View.OnClickListener() {
65
+ @Override
66
+ public void onClick(View arg0) {
67
+ if (yourSelectedImage == null)
68
+ return;
69
+ YOLOXncnn.Obj[] objects = yoloX.Detect(yourSelectedImage, false);
70
+
71
+ showObjects(objects);
72
+ }
73
+ });
74
+
75
+ Button buttonDetectGPU = (Button) findViewById(R.id.buttonDetectGPU);
76
+ buttonDetectGPU.setOnClickListener(new View.OnClickListener() {
77
+ @Override
78
+ public void onClick(View arg0) {
79
+ if (yourSelectedImage == null)
80
+ return;
81
+
82
+ YOLOXncnn.Obj[] objects = yoloX.Detect(yourSelectedImage, true);
83
+
84
+ showObjects(objects);
85
+ }
86
+ });
87
+ }
88
+
89
+ private void showObjects(YOLOXncnn.Obj[] objects)
90
+ {
91
+ if (objects == null)
92
+ {
93
+ imageView.setImageBitmap(bitmap);
94
+ return;
95
+ }
96
+
97
+ // draw objects on bitmap
98
+ Bitmap rgba = bitmap.copy(Bitmap.Config.ARGB_8888, true);
99
+
100
+ final int[] colors = new int[] {
101
+ Color.rgb( 54, 67, 244),
102
+ Color.rgb( 99, 30, 233),
103
+ Color.rgb(176, 39, 156),
104
+ Color.rgb(183, 58, 103),
105
+ Color.rgb(181, 81, 63),
106
+ Color.rgb(243, 150, 33),
107
+ Color.rgb(244, 169, 3),
108
+ Color.rgb(212, 188, 0),
109
+ Color.rgb(136, 150, 0),
110
+ Color.rgb( 80, 175, 76),
111
+ Color.rgb( 74, 195, 139),
112
+ Color.rgb( 57, 220, 205),
113
+ Color.rgb( 59, 235, 255),
114
+ Color.rgb( 7, 193, 255),
115
+ Color.rgb( 0, 152, 255),
116
+ Color.rgb( 34, 87, 255),
117
+ Color.rgb( 72, 85, 121),
118
+ Color.rgb(158, 158, 158),
119
+ Color.rgb(139, 125, 96)
120
+ };
121
+
122
+ Canvas canvas = new Canvas(rgba);
123
+
124
+ Paint paint = new Paint();
125
+ paint.setStyle(Paint.Style.STROKE);
126
+ paint.setStrokeWidth(4);
127
+
128
+ Paint textbgpaint = new Paint();
129
+ textbgpaint.setColor(Color.WHITE);
130
+ textbgpaint.setStyle(Paint.Style.FILL);
131
+
132
+ Paint textpaint = new Paint();
133
+ textpaint.setColor(Color.BLACK);
134
+ textpaint.setTextSize(26);
135
+ textpaint.setTextAlign(Paint.Align.LEFT);
136
+
137
+ for (int i = 0; i < objects.length; i++)
138
+ {
139
+ paint.setColor(colors[i % 19]);
140
+
141
+ canvas.drawRect(objects[i].x, objects[i].y, objects[i].x + objects[i].w, objects[i].y + objects[i].h, paint);
142
+
143
+ // draw filled text inside image
144
+ {
145
+ String text = objects[i].label + " = " + String.format("%.1f", objects[i].prob * 100) + "%";
146
+
147
+ float text_width = textpaint.measureText(text);
148
+ float text_height = - textpaint.ascent() + textpaint.descent();
149
+
150
+ float x = objects[i].x;
151
+ float y = objects[i].y - text_height;
152
+ if (y < 0)
153
+ y = 0;
154
+ if (x + text_width > rgba.getWidth())
155
+ x = rgba.getWidth() - text_width;
156
+
157
+ canvas.drawRect(x, y, x + text_width, y + text_height, textbgpaint);
158
+
159
+ canvas.drawText(text, x, y - textpaint.ascent(), textpaint);
160
+ }
161
+ }
162
+
163
+ imageView.setImageBitmap(rgba);
164
+ }
165
+
166
+ @Override
167
+ protected void onActivityResult(int requestCode, int resultCode, Intent data)
168
+ {
169
+ super.onActivityResult(requestCode, resultCode, data);
170
+
171
+ if (resultCode == RESULT_OK && null != data) {
172
+ Uri selectedImage = data.getData();
173
+
174
+ try
175
+ {
176
+ if (requestCode == SELECT_IMAGE) {
177
+ bitmap = decodeUri(selectedImage);
178
+
179
+ yourSelectedImage = bitmap.copy(Bitmap.Config.ARGB_8888, true);
180
+
181
+ imageView.setImageBitmap(bitmap);
182
+ }
183
+ }
184
+ catch (FileNotFoundException e)
185
+ {
186
+ Log.e("MainActivity", "FileNotFoundException");
187
+ return;
188
+ }
189
+ }
190
+ }
191
+
192
+ private Bitmap decodeUri(Uri selectedImage) throws FileNotFoundException
193
+ {
194
+ // Decode image size
195
+ BitmapFactory.Options o = new BitmapFactory.Options();
196
+ o.inJustDecodeBounds = true;
197
+ BitmapFactory.decodeStream(getContentResolver().openInputStream(selectedImage), null, o);
198
+
199
+ // The new size we want to scale to
200
+ final int REQUIRED_SIZE = 640;
201
+
202
+ // Find the correct scale value. It should be a power of 2.
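+ // Illustrative example: a 4000x3000 photo halves to 2000x1500, then to 1000x750;
+ // the next halving would drop below 640, so the loop stops with inSampleSize = 4.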
203
+ int width_tmp = o.outWidth, height_tmp = o.outHeight;
204
+ int scale = 1;
205
+ while (true) {
206
+ if (width_tmp / 2 < REQUIRED_SIZE || height_tmp / 2 < REQUIRED_SIZE) {
207
+ break;
208
+ }
209
+ width_tmp /= 2;
210
+ height_tmp /= 2;
211
+ scale *= 2;
212
+ }
213
+
214
+ // Decode with inSampleSize
215
+ BitmapFactory.Options o2 = new BitmapFactory.Options();
216
+ o2.inSampleSize = scale;
217
+ Bitmap bitmap = BitmapFactory.decodeStream(getContentResolver().openInputStream(selectedImage), null, o2);
218
+
219
+ // Rotate according to EXIF
220
+ int rotate = 0;
221
+ try
222
+ {
223
+ ExifInterface exif = new ExifInterface(getContentResolver().openInputStream(selectedImage));
224
+ int orientation = exif.getAttributeInt(ExifInterface.TAG_ORIENTATION, ExifInterface.ORIENTATION_NORMAL);
225
+ switch (orientation) {
226
+ case ExifInterface.ORIENTATION_ROTATE_270:
227
+ rotate = 270;
228
+ break;
229
+ case ExifInterface.ORIENTATION_ROTATE_180:
230
+ rotate = 180;
231
+ break;
232
+ case ExifInterface.ORIENTATION_ROTATE_90:
233
+ rotate = 90;
234
+ break;
235
+ }
236
+ }
237
+ catch (IOException e)
238
+ {
239
+ Log.e("MainActivity", "ExifInterface IOException");
240
+ }
241
+
242
+ Matrix matrix = new Matrix();
243
+ matrix.postRotate(rotate);
244
+ return Bitmap.createBitmap(bitmap, 0, 0, bitmap.getWidth(), bitmap.getHeight(), matrix, true);
245
+ }
246
+
247
+ }
multimodal/YOLOX/demo/ncnn/android/app/src/main/java/com/megvii/yoloXncnn/YOLOXncnn.java ADDED
@@ -0,0 +1,27 @@
1
+ // Copyright (C) Megvii, Inc. and its affiliates. All rights reserved.
2
+
3
+ package com.megvii.yoloXncnn;
4
+
5
+ import android.content.res.AssetManager;
6
+ import android.graphics.Bitmap;
7
+
8
+ public class YOLOXncnn
9
+ {
10
+ public native boolean Init(AssetManager mgr);
11
+
12
+ public class Obj
13
+ {
14
+ public float x;
15
+ public float y;
16
+ public float w;
17
+ public float h;
18
+ public String label;
19
+ public float prob;
20
+ }
21
+
22
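+ // Detections come back in bitmap pixel coordinates (x, y, w, h), with a COCO class
+ // name in label and objectness * class score in prob, as filled in by yoloXncnn_jni.cpp.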
+ public native Obj[] Detect(Bitmap bitmap, boolean use_gpu);
23
+
24
+ static {
25
+ System.loadLibrary("yoloXncnn");
26
+ }
27
+ }
multimodal/YOLOX/demo/ncnn/android/app/src/main/java/com/megvii/yoloXncnn/yoloXncnn.java ADDED
@@ -0,0 +1,27 @@
1
+ // Copyright (C) Megvii, Inc. and its affiliates. All rights reserved.
2
+
3
+ package com.megvii.yoloXncnn;
4
+
5
+ import android.content.res.AssetManager;
6
+ import android.graphics.Bitmap;
7
+
8
+ public class YOLOXncnn
9
+ {
10
+ public native boolean Init(AssetManager mgr);
11
+
12
+ public class Obj
13
+ {
14
+ public float x;
15
+ public float y;
16
+ public float w;
17
+ public float h;
18
+ public String label;
19
+ public float prob;
20
+ }
21
+
22
+ public native Obj[] Detect(Bitmap bitmap, boolean use_gpu);
23
+
24
+ static {
25
+ System.loadLibrary("yoloXncnn");
26
+ }
27
+ }
multimodal/YOLOX/demo/ncnn/android/app/src/main/jni/CMakeLists.txt ADDED
@@ -0,0 +1,14 @@
1
+ project(yoloXncnn)
2
+
3
+ cmake_minimum_required(VERSION 3.4.1)
4
+
5
+ set(ncnn_DIR ${CMAKE_SOURCE_DIR}/ncnn-20210525-android-vulkan/${ANDROID_ABI}/lib/cmake/ncnn)
6
+ find_package(ncnn REQUIRED)
7
+
8
+ add_library(yoloXncnn SHARED yoloXncnn_jni.cpp)
9
+
10
+ target_link_libraries(yoloXncnn
11
+ ncnn
12
+
13
+ jnigraphics
14
+ )
multimodal/YOLOX/demo/ncnn/android/app/src/main/jni/yoloXncnn_jni.cpp ADDED
@@ -0,0 +1,474 @@
1
+ // Some code in this file is based on:
2
+ // https://github.com/nihui/ncnn-android-yolov5/blob/master/app/src/main/jni/yolov5ncnn_jni.cpp
3
+ // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
4
+ // Copyright (C) Megvii, Inc. and its affiliates. All rights reserved.
5
+
6
+ #include <android/asset_manager_jni.h>
7
+ #include <android/bitmap.h>
8
+ #include <android/log.h>
9
+
10
+ #include <jni.h>
11
+
12
+ #include <string>
13
+ #include <vector>
14
+
15
+ // ncnn
16
+ #include "layer.h"
17
+ #include "net.h"
18
+ #include "benchmark.h"
19
+
20
+ static ncnn::UnlockedPoolAllocator g_blob_pool_allocator;
21
+ static ncnn::PoolAllocator g_workspace_pool_allocator;
22
+
23
+ static ncnn::Net yoloX;
24
+
25
+ class YoloV5Focus : public ncnn::Layer
26
+ {
27
+ public:
28
+ YoloV5Focus()
29
+ {
30
+ one_blob_only = true;
31
+ }
32
+
33
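+ // Focus / space-to-depth: rearranges a C x H x W blob into 4C x H/2 x W/2 by
+ // gathering every second pixel in both directions (one output channel per 2x2 offset).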
+ virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const
34
+ {
35
+ int w = bottom_blob.w;
36
+ int h = bottom_blob.h;
37
+ int channels = bottom_blob.c;
38
+
39
+ int outw = w / 2;
40
+ int outh = h / 2;
41
+ int outc = channels * 4;
42
+
43
+ top_blob.create(outw, outh, outc, 4u, 1, opt.blob_allocator);
44
+ if (top_blob.empty())
45
+ return -100;
46
+
47
+ #pragma omp parallel for num_threads(opt.num_threads)
48
+ for (int p = 0; p < outc; p++)
49
+ {
50
+ const float* ptr = bottom_blob.channel(p % channels).row((p / channels) % 2) + ((p / channels) / 2);
51
+ float* outptr = top_blob.channel(p);
52
+
53
+ for (int i = 0; i < outh; i++)
54
+ {
55
+ for (int j = 0; j < outw; j++)
56
+ {
57
+ *outptr = *ptr;
58
+
59
+ outptr += 1;
60
+ ptr += 2;
61
+ }
62
+
63
+ ptr += w;
64
+ }
65
+ }
66
+
67
+ return 0;
68
+ }
69
+ };
70
+
71
+ DEFINE_LAYER_CREATOR(YoloV5Focus)
72
+
73
+ struct Object
74
+ {
75
+ float x;
76
+ float y;
77
+ float w;
78
+ float h;
79
+ int label;
80
+ float prob;
81
+ };
82
+
83
+ struct GridAndStride
84
+ {
85
+ int grid0;
86
+ int grid1;
87
+ int stride;
88
+ };
89
+
90
+ static inline float intersection_area(const Object& a, const Object& b)
91
+ {
92
+ if (a.x > b.x + b.w || a.x + a.w < b.x || a.y > b.y + b.h || a.y + a.h < b.y)
93
+ {
94
+ // no intersection
95
+ return 0.f;
96
+ }
97
+
98
+ float inter_width = std::min(a.x + a.w, b.x + b.w) - std::max(a.x, b.x);
99
+ float inter_height = std::min(a.y + a.h, b.y + b.h) - std::max(a.y, b.y);
100
+
101
+ return inter_width * inter_height;
102
+ }
103
+
104
+ static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
105
+ {
106
+ int i = left;
107
+ int j = right;
108
+ float p = faceobjects[(left + right) / 2].prob;
109
+
110
+ while (i <= j)
111
+ {
112
+ while (faceobjects[i].prob > p)
113
+ i++;
114
+
115
+ while (faceobjects[j].prob < p)
116
+ j--;
117
+
118
+ if (i <= j)
119
+ {
120
+ // swap
121
+ std::swap(faceobjects[i], faceobjects[j]);
122
+
123
+ i++;
124
+ j--;
125
+ }
126
+ }
127
+
128
+ #pragma omp parallel sections
129
+ {
130
+ #pragma omp section
131
+ {
132
+ if (left < j) qsort_descent_inplace(faceobjects, left, j);
133
+ }
134
+ #pragma omp section
135
+ {
136
+ if (i < right) qsort_descent_inplace(faceobjects, i, right);
137
+ }
138
+ }
139
+ }
140
+
141
+ static void qsort_descent_inplace(std::vector<Object>& faceobjects)
142
+ {
143
+ if (faceobjects.empty())
144
+ return;
145
+
146
+ qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
147
+ }
148
+
149
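+ // Greedy NMS over boxes already sorted by score: a proposal is kept only if its IoU
+ // with every previously kept box does not exceed nms_threshold.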
+ static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold)
150
+ {
151
+ picked.clear();
152
+
153
+ const int n = faceobjects.size();
154
+
155
+ std::vector<float> areas(n);
156
+ for (int i = 0; i < n; i++)
157
+ {
158
+ areas[i] = faceobjects[i].w * faceobjects[i].h;
159
+ }
160
+
161
+ for (int i = 0; i < n; i++)
162
+ {
163
+ const Object& a = faceobjects[i];
164
+
165
+ int keep = 1;
166
+ for (int j = 0; j < (int)picked.size(); j++)
167
+ {
168
+ const Object& b = faceobjects[picked[j]];
169
+
170
+ // intersection over union
171
+ float inter_area = intersection_area(a, b);
172
+ float union_area = areas[i] + areas[picked[j]] - inter_area;
173
+ // float IoU = inter_area / union_area
174
+ if (inter_area / union_area > nms_threshold)
175
+ keep = 0;
176
+ }
177
+
178
+ if (keep)
179
+ picked.push_back(i);
180
+ }
181
+ }
182
+
183
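+ // For target_size 640 and strides {8, 16, 32} this enumerates 80*80 + 40*40 + 20*20
+ // = 8400 grid cells, one per row of the "output" blob decoded below.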
+ static void generate_grids_and_stride(const int target_size, std::vector<int>& strides, std::vector<GridAndStride>& grid_strides)
184
+ {
185
+ for (auto stride : strides)
186
+ {
187
+ int num_grid = target_size / stride;
188
+ for (int g1 = 0; g1 < num_grid; g1++)
189
+ {
190
+ for (int g0 = 0; g0 < num_grid; g0++)
191
+ {
192
+ grid_strides.push_back((GridAndStride){g0, g1, stride});
193
+ }
194
+ }
195
+ }
196
+ }
197
+
198
+ static void generate_yolox_proposals(std::vector<GridAndStride> grid_strides, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<Object>& objects)
199
+ {
200
+ const int num_grid = feat_blob.h;
201
+ fprintf(stderr, "output height: %d, width: %d, channels: %d, dims:%d\n", feat_blob.h, feat_blob.w, feat_blob.c, feat_blob.dims);
202
+
203
+ const int num_class = feat_blob.w - 5;
204
+
205
+ const int num_anchors = grid_strides.size();
206
+
207
+ const float* feat_ptr = feat_blob.channel(0);
208
+ for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++)
209
+ {
210
+ const int grid0 = grid_strides[anchor_idx].grid0;
211
+ const int grid1 = grid_strides[anchor_idx].grid1;
212
+ const int stride = grid_strides[anchor_idx].stride;
213
+
214
+ // yolox/models/yolo_head.py decode logic
215
+ // outputs[..., :2] = (outputs[..., :2] + grids) * strides
216
+ // outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
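+ // Illustrative numbers: stride 8, grid0 = 2, grid1 = 3 and raw outputs (0.4, 0.6, 0.2, 0.1)
+ // give x_center = (0.4 + 2) * 8 = 19.2, y_center = (0.6 + 3) * 8 = 28.8,
+ // w = exp(0.2) * 8 ~= 9.77, h = exp(0.1) * 8 ~= 8.84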
217
+ float x_center = (feat_ptr[0] + grid0) * stride;
218
+ float y_center = (feat_ptr[1] + grid1) * stride;
219
+ float w = exp(feat_ptr[2]) * stride;
220
+ float h = exp(feat_ptr[3]) * stride;
221
+ float x0 = x_center - w * 0.5f;
222
+ float y0 = y_center - h * 0.5f;
223
+
224
+ float box_objectness = feat_ptr[4];
225
+ for (int class_idx = 0; class_idx < num_class; class_idx++)
226
+ {
227
+ float box_cls_score = feat_ptr[5 + class_idx];
228
+ float box_prob = box_objectness * box_cls_score;
229
+ if (box_prob > prob_threshold)
230
+ {
231
+ Object obj;
232
+ obj.x = x0;
233
+ obj.y = y0;
234
+ obj.w = w;
235
+ obj.h = h;
236
+ obj.label = class_idx;
237
+ obj.prob = box_prob;
238
+
239
+ objects.push_back(obj);
240
+ }
241
+
242
+ } // class loop
243
+ feat_ptr += feat_blob.w;
244
+
245
+ } // point anchor loop
246
+ }
247
+
248
+
249
+ extern "C" {
250
+
251
+ // FIXME DeleteGlobalRef is missing for objCls
252
+ static jclass objCls = NULL;
253
+ static jmethodID constructortorId;
254
+ static jfieldID xId;
255
+ static jfieldID yId;
256
+ static jfieldID wId;
257
+ static jfieldID hId;
258
+ static jfieldID labelId;
259
+ static jfieldID probId;
260
+
261
+ JNIEXPORT jint JNI_OnLoad(JavaVM* vm, void* reserved)
262
+ {
263
+ __android_log_print(ANDROID_LOG_DEBUG, "YOLOXncnn", "JNI_OnLoad");
264
+
265
+ ncnn::create_gpu_instance();
266
+
267
+ return JNI_VERSION_1_4;
268
+ }
269
+
270
+ JNIEXPORT void JNI_OnUnload(JavaVM* vm, void* reserved)
271
+ {
272
+ __android_log_print(ANDROID_LOG_DEBUG, "YOLOXncnn", "JNI_OnUnload");
273
+
274
+ ncnn::destroy_gpu_instance();
275
+ }
276
+
277
+ // public native boolean Init(AssetManager mgr);
278
+ JNIEXPORT jboolean JNICALL Java_com_megvii_yoloXncnn_YOLOXncnn_Init(JNIEnv* env, jobject thiz, jobject assetManager)
279
+ {
280
+ ncnn::Option opt;
281
+ opt.lightmode = true;
282
+ opt.num_threads = 4;
283
+ opt.blob_allocator = &g_blob_pool_allocator;
284
+ opt.workspace_allocator = &g_workspace_pool_allocator;
285
+ opt.use_packing_layout = true;
286
+
287
+ // use vulkan compute
288
+ if (ncnn::get_gpu_count() != 0)
289
+ opt.use_vulkan_compute = true;
290
+
291
+ AAssetManager* mgr = AAssetManager_fromJava(env, assetManager);
292
+
293
+ yoloX.opt = opt;
294
+
295
+ yoloX.register_custom_layer("YoloV5Focus", YoloV5Focus_layer_creator);
296
+
297
+ // init param
298
+ {
299
+ int ret = yoloX.load_param(mgr, "yolox.param");
300
+ if (ret != 0)
301
+ {
302
+ __android_log_print(ANDROID_LOG_DEBUG, "YOLOXncnn", "load_param failed");
303
+ return JNI_FALSE;
304
+ }
305
+ }
306
+
307
+ // init bin
308
+ {
309
+ int ret = yoloX.load_model(mgr, "yolox.bin");
310
+ if (ret != 0)
311
+ {
312
+ __android_log_print(ANDROID_LOG_DEBUG, "YOLOXncnn", "load_model failed");
313
+ return JNI_FALSE;
314
+ }
315
+ }
316
+
317
+ // init jni glue
318
+ jclass localObjCls = env->FindClass("com/megvii/yoloXncnn/YOLOXncnn$Obj");
319
+ objCls = reinterpret_cast<jclass>(env->NewGlobalRef(localObjCls));
320
+
321
+ constructortorId = env->GetMethodID(objCls, "<init>", "(Lcom/megvii/yoloXncnn/YOLOXncnn;)V");
322
+
323
+ xId = env->GetFieldID(objCls, "x", "F");
324
+ yId = env->GetFieldID(objCls, "y", "F");
325
+ wId = env->GetFieldID(objCls, "w", "F");
326
+ hId = env->GetFieldID(objCls, "h", "F");
327
+ labelId = env->GetFieldID(objCls, "label", "Ljava/lang/String;");
328
+ probId = env->GetFieldID(objCls, "prob", "F");
329
+
330
+ return JNI_TRUE;
331
+ }
332
+
333
+ // public native Obj[] Detect(Bitmap bitmap, boolean use_gpu);
334
+ JNIEXPORT jobjectArray JNICALL Java_com_megvii_yoloXncnn_YOLOXncnn_Detect(JNIEnv* env, jobject thiz, jobject bitmap, jboolean use_gpu)
335
+ {
336
+ if (use_gpu == JNI_TRUE && ncnn::get_gpu_count() == 0)
337
+ {
338
+ return NULL;
339
+ //return env->NewStringUTF("no vulkan capable gpu");
340
+ }
341
+
342
+ double start_time = ncnn::get_current_time();
343
+
344
+ AndroidBitmapInfo info;
345
+ AndroidBitmap_getInfo(env, bitmap, &info);
346
+ const int width = info.width;
347
+ const int height = info.height;
348
+ if (info.format != ANDROID_BITMAP_FORMAT_RGBA_8888)
349
+ return NULL;
350
+
351
+ // parameters which might change for a different model
352
+ const int target_size = 640;
353
+ const float prob_threshold = 0.3f;
354
+ const float nms_threshold = 0.65f;
355
+ std::vector<int> strides = {8, 16, 32}; // might have stride=64
356
+
357
+ int w = width;
358
+ int h = height;
359
+ float scale = 1.f;
360
+ if (w > h)
361
+ {
362
+ scale = (float)target_size / w;
363
+ w = target_size;
364
+ h = h * scale;
365
+ }
366
+ else
367
+ {
368
+ scale = (float)target_size / h;
369
+ h = target_size;
370
+ w = w * scale;
371
+ }
372
+
373
+ ncnn::Mat in = ncnn::Mat::from_android_bitmap_resize(env, bitmap, ncnn::Mat::PIXEL_RGB2BGR, w, h);
374
+
375
+ // pad to target_size rectangle
376
+ int wpad = target_size - w;
377
+ int hpad = target_size - h;
378
+ ncnn::Mat in_pad;
379
+ // different from yolov5, yolox only pad on bottom and right side,
380
+ // which means users don't need to extra padding info to decode boxes coordinate.
381
+ ncnn::copy_make_border(in, in_pad, 0, hpad, 0, wpad, ncnn::BORDER_CONSTANT, 114.f);
382
+
383
+ // yolox
384
+ std::vector<Object> objects;
385
+ {
386
+
387
+ ncnn::Extractor ex = yoloX.create_extractor();
388
+
389
+ ex.set_vulkan_compute(use_gpu);
390
+
391
+ ex.input("images", in_pad);
392
+
393
+ std::vector<Object> proposals;
394
+
395
+ // yolox decode and generate proposal logic
396
+ {
397
+ ncnn::Mat out;
398
+ ex.extract("output", out);
399
+
400
+ std::vector<GridAndStride> grid_strides;
401
+ generate_grids_and_stride(target_size, strides, grid_strides);
402
+ generate_yolox_proposals(grid_strides, out, prob_threshold, proposals);
403
+
404
+ }
405
+
406
+ // sort all proposals by score from highest to lowest
407
+ qsort_descent_inplace(proposals);
408
+
409
+ // apply nms with nms_threshold
410
+ std::vector<int> picked;
411
+ nms_sorted_bboxes(proposals, picked, nms_threshold);
412
+
413
+ int count = picked.size();
414
+
415
+ objects.resize(count);
416
+ for (int i = 0; i < count; i++)
417
+ {
418
+ objects[i] = proposals[picked[i]];
419
+
420
+ // adjust offsets back to the original unpadded image
421
+ float x0 = (objects[i].x) / scale;
422
+ float y0 = (objects[i].y) / scale;
423
+ float x1 = (objects[i].x + objects[i].w) / scale;
424
+ float y1 = (objects[i].y + objects[i].h) / scale;
425
+
426
+ // clip
427
+ x0 = std::max(std::min(x0, (float)(width - 1)), 0.f);
428
+ y0 = std::max(std::min(y0, (float)(height - 1)), 0.f);
429
+ x1 = std::max(std::min(x1, (float)(width - 1)), 0.f);
430
+ y1 = std::max(std::min(y1, (float)(height - 1)), 0.f);
431
+
432
+ objects[i].x = x0;
433
+ objects[i].y = y0;
434
+ objects[i].w = x1 - x0;
435
+ objects[i].h = y1 - y0;
436
+ }
437
+ }
438
+
439
+ // objects to Obj[]
440
+ static const char* class_names[] = {
441
+ "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
442
+ "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
443
+ "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
444
+ "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
445
+ "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
446
+ "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
447
+ "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
448
+ "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
449
+ "hair drier", "toothbrush"
450
+ };
451
+
452
+ jobjectArray jObjArray = env->NewObjectArray(objects.size(), objCls, NULL);
453
+
454
+ for (size_t i=0; i<objects.size(); i++)
455
+ {
456
+ jobject jObj = env->NewObject(objCls, constructortorId, thiz);
457
+
458
+ env->SetFloatField(jObj, xId, objects[i].x);
459
+ env->SetFloatField(jObj, yId, objects[i].y);
460
+ env->SetFloatField(jObj, wId, objects[i].w);
461
+ env->SetFloatField(jObj, hId, objects[i].h);
462
+ env->SetObjectField(jObj, labelId, env->NewStringUTF(class_names[objects[i].label]));
463
+ env->SetFloatField(jObj, probId, objects[i].prob);
464
+
465
+ env->SetObjectArrayElement(jObjArray, i, jObj);
466
+ }
467
+
468
+ double elapsed = ncnn::get_current_time() - start_time;
469
+ __android_log_print(ANDROID_LOG_DEBUG, "YOLOXncnn", "%.2fms detect", elapsed);
470
+
471
+ return jObjArray;
472
+ }
473
+
474
+ }