from CircumSpect.object_detection.groundingdino.util.inference import load_model, predict
import CircumSpect.object_detection.groundingdino.datasets.transforms as T
from torchvision.ops import box_convert
from utils import setup_device
from typing import Tuple, List
import supervision as sv
from io import BytesIO
from PIL import Image
import numpy as np
import requests
import torch
import cv2

model = load_model(
    "./CircumSpect/object_detection/groundingdino/config/GroundingDINO_SwinT_OGC.py",
    "./CircumSpect/object_detection/weights/groundingdino_swint_ogc.pth",
)


def load_image(image_path: str) -> Tuple[np.ndarray, torch.Tensor]:
    """Load an image from a local path or URL and return it as a NumPy array
    plus the normalized tensor GroundingDINO expects."""
    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    source = (
        BytesIO(requests.get(image_path).content)
        if image_path.startswith("http")
        else image_path
    )
    image_source = Image.open(source).convert("RGB")
    image = np.asarray(image_source)
    image_transformed, _ = transform(image_source, None)
    return image, image_transformed


def annotate(image_source: np.ndarray, boxes: torch.Tensor, logits: torch.Tensor,
             phrases: List[str]) -> Tuple[np.ndarray, List[list]]:
    """Draw labeled boxes on the image and return it (BGR) together with a
    [label, center point] entry for each detection."""
    h, w, _ = image_source.shape
    # Scale normalized cxcywh boxes to pixel coordinates, then convert to xyxy.
    boxes = boxes * torch.Tensor([w, h, w, h])
    xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
    detections = sv.Detections(xyxy=xyxy)

    labels = [
        f"{phrase} {logit:.2f}"
        for phrase, logit in zip(phrases, logits)
    ]

    # Box centers, computed directly from the xyxy corners.
    coordinates = [
        (int((left + right) / 2), int((top + bottom) / 2))
        for left, top, right, bottom in xyxy
    ]
    object_coordinates = [
        [f"{phrase}: {logit * 100:.0f}%", coordinate]
        for phrase, logit, coordinate in zip(phrases, logits, coordinates)
    ]

    # Note: passing `labels` to BoxAnnotator.annotate requires an older
    # supervision release; newer versions move labels to sv.LabelAnnotator.
    box_annotator = sv.BoxAnnotator()
    annotated_frame = cv2.cvtColor(image_source, cv2.COLOR_RGB2BGR)
    annotated_frame = box_annotator.annotate(
        scene=annotated_frame, detections=detections, labels=labels)
    return annotated_frame, object_coordinates


BOX_THRESHOLD = 0.35
TEXT_THRESHOLD = 0.25
device = setup_device()


def locate_object(objects: str, image: str):
    """Detect `objects` (a GroundingDINO text prompt) in `image` (path or URL),
    save an annotated copy to disk, and return the frame plus object centers."""
    image_source, image_tensor = load_image(image)
    boxes, logits, phrases = predict(
        model=model,
        image=image_tensor,
        device=device,
        caption=objects,
        box_threshold=BOX_THRESHOLD,
        text_threshold=TEXT_THRESHOLD,
    )
    annotated_frame, object_coordinates = annotate(
        image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
    # Mark the center of the first detection, if there is one.
    if object_coordinates:
        cv2.circle(annotated_frame, object_coordinates[0][1], 2, (255, 0, 0), 2)
    cv2.imwrite("detected_objects.png", annotated_frame)
    return annotated_frame, object_coordinates


if __name__ == "__main__":
    frame, coord = locate_object(
        "drawer",
        "https://images.nationalgeographic.org/image/upload/v1638890052/EducationHub/photos/robots-3d-landing-page.jpg")
    print(coord)