from models import *
from utils import *

import time
import cv2
import numpy as np
import torch
from torchvision import transforms
from PIL import Image

from sort import *

# load weights and set defaults
config_path = 'config/yolov3.cfg'
weights_path = 'config/yolov3.weights'  # official YOLOv3 weights: https://pjreddie.com/media/files/yolov3.weights
class_path = 'config/coco.names'
img_size = 416
conf_thres = 0.8
nms_thres = 0.4

# load model and put into eval mode
model = Darknet(config_path, img_size=img_size)
model.load_weights(weights_path)
model.cuda()
model.eval()

classes = load_classes(class_path)
Tensor = torch.cuda.FloatTensor


def detect_image(img):
    # resize so the longer side is img_size, then pad the shorter side
    # with gray (128,128,128) to get a square img_size x img_size input
    ratio = min(img_size / img.size[0], img_size / img.size[1])
    imw = round(img.size[0] * ratio)
    imh = round(img.size[1] * ratio)
    img_transforms = transforms.Compose([
        transforms.Resize((imh, imw)),
        transforms.Pad((max(int((imh - imw) / 2), 0),
                        max(int((imw - imh) / 2), 0),
                        max(int((imh - imw) / 2), 0),
                        max(int((imw - imh) / 2), 0)),
                       fill=(128, 128, 128)),
        transforms.ToTensor(),
    ])
    # convert image to a batched Tensor
    image_tensor = img_transforms(img).float().unsqueeze(0)
    input_img = image_tensor.type(Tensor)
    # run inference on the model and filter detections with NMS (80 COCO classes)
    with torch.no_grad():
        detections = model(input_img)
        detections = non_max_suppression(detections, 80, conf_thres, nms_thres)
    return detections[0]


videopath = './videos/HorseRacing.mp4'

colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 0, 255), (128, 0, 0),
          (0, 128, 0), (0, 0, 128), (128, 0, 128), (128, 128, 0), (0, 128, 128)]

vid = cv2.VideoCapture(videopath)
mot_tracker = Sort()

cv2.namedWindow('Stream', cv2.WINDOW_NORMAL)
cv2.resizeWindow('Stream', 800, 600)

# read the video size from the capture metadata instead of consuming the first frame
vw = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
vh = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
print("Video size", vw, vh)
# 'mp4v' matches the .mp4 output container ('XVID' is an AVI codec)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
outvideo = cv2.VideoWriter(videopath.replace(".mp4", "-det.mp4"), fourcc, 20.0, (vw, vh))

frames = 0
starttime = time.time()
while True:
    ret, frame = vid.read()
    if not ret:
        break
    frames += 1
    # the model expects RGB, OpenCV delivers BGR
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    pilimg = Image.fromarray(frame)
    detections = detect_image(pilimg)

    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
    img = np.array(pilimg)
    # padding added by detect_image, expressed in model-input pixels
    pad_x = max(img.shape[0] - img.shape[1], 0) * (img_size / max(img.shape))
    pad_y = max(img.shape[1] - img.shape[0], 0) * (img_size / max(img.shape))
    unpad_h = img_size - pad_y
    unpad_w = img_size - pad_x
    if detections is not None:
        tracked_objects = mot_tracker.update(detections.cpu())
        for x1, y1, x2, y2, obj_id, cls_pred in tracked_objects:
            # map box coordinates from the padded model input back to the original frame
            box_h = int(((y2 - y1) / unpad_h) * img.shape[0])
            box_w = int(((x2 - x1) / unpad_w) * img.shape[1])
            y1 = int(((y1 - pad_y // 2) / unpad_h) * img.shape[0])
            x1 = int(((x1 - pad_x // 2) / unpad_w) * img.shape[1])
            color = colors[int(obj_id) % len(colors)]
            cls = classes[int(cls_pred)]
            cv2.rectangle(frame, (x1, y1), (x1 + box_w, y1 + box_h), color, 4)
            cv2.rectangle(frame, (x1, y1 - 35), (x1 + len(cls) * 19 + 80, y1), color, -1)
            cv2.putText(frame, cls + "-" + str(int(obj_id)), (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 3)

    cv2.imshow('Stream', frame)
    outvideo.write(frame)
    ch = 0xFF & cv2.waitKey(1)
    if ch == 27:  # ESC quits
        break

totaltime = time.time() - starttime
print(frames, "frames", totaltime / frames, "s/frame")
cv2.destroyAllWindows()
outvideo.release()
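
# --- Optional variant (not part of the original script): device-agnostic setup ---
# The code above hard-codes CUDA (model.cuda(), torch.cuda.FloatTensor) and will
# fail on CPU-only machines. A minimal sketch of an equivalent setup, assuming
# the same Darknet model and config paths:
#
#   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#   model = Darknet(config_path, img_size=img_size)
#   model.load_weights(weights_path)
#   model.to(device).eval()
#   # ...and in detect_image(): input_img = image_tensor.to(device)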