,\n",
" 'http://127.0.0.1:7862/',\n",
" 'https://13389.gradio.app')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-02-15 18:27:19.011924: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
]
}
],
"source": [
"import os\n",
"os.system(\"wget https://upload.wikimedia.org/wikipedia/commons/thumb/e/ea/Van_Gogh_-_Starry_Night_-_Google_Art_Project.jpg/1920px-Van_Gogh_-_Starry_Night_-_Google_Art_Project.jpg -O starry.jpg\")\n",
"\n",
"from PIL import Image\n",
"import requests\n",
"import torch\n",
"from torchvision import transforms\n",
"from torchvision.transforms.functional import InterpolationMode\n",
"\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"\n",
"\n",
"\n",
" \n",
"#MDETR Code \n",
"import torchvision.transforms as T\n",
"import matplotlib.pyplot as plt\n",
"from collections import defaultdict\n",
"import torch.nn.functional as F\n",
"import numpy as np\n",
"from skimage.measure import find_contours\n",
"\n",
"from matplotlib import patches, lines\n",
"from matplotlib.patches import Polygon\n",
"import gradio as gr\n",
"\n",
"torch.hub.download_url_to_file('https://cdn.pixabay.com/photo/2014/03/04/15/10/elephants-279505_1280.jpg', 'elephant.jpg')\n",
"\n",
"\n",
"model2, postprocessor = torch.hub.load('ashkamath/mdetr:main', 'mdetr_efficientnetB5', pretrained=True, return_postprocessor=True)\n",
"model2 = model2.cpu()\n",
"model2.eval()\n",
"\n",
"\n",
"\n",
"\n",
"torch.set_grad_enabled(False);\n",
"# standard PyTorch mean-std input image normalization\n",
"transform = T.Compose([\n",
" T.Resize(800),\n",
" T.ToTensor(),\n",
" T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n",
"])\n",
"\n",
"# for output bounding box post-processing\n",
"def box_cxcywh_to_xyxy(x):\n",
" x_c, y_c, w, h = x.unbind(1)\n",
" b = [(x_c - 0.5 * w), (y_c - 0.5 * h),\n",
" (x_c + 0.5 * w), (y_c + 0.5 * h)]\n",
" return torch.stack(b, dim=1)\n",
"\n",
"def rescale_bboxes(out_bbox, size):\n",
" img_w, img_h = size\n",
" b = box_cxcywh_to_xyxy(out_bbox)\n",
" b = b * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)\n",
" return b\n",
"# colors for visualization\n",
"COLORS = [[0.000, 0.447, 0.741], [0.850, 0.325, 0.098], [0.929, 0.694, 0.125],\n",
" [0.494, 0.184, 0.556], [0.466, 0.674, 0.188], [0.301, 0.745, 0.933]]\n",
"\n",
"def apply_mask(image, mask, color, alpha=0.5):\n",
" \"\"\"Apply the given mask to the image.\n",
" \"\"\"\n",
" for c in range(3):\n",
" image[:, :, c] = np.where(mask == 1,\n",
" image[:, :, c] *\n",
" (1 - alpha) + alpha * color[c] * 255,\n",
" image[:, :, c])\n",
" return image\n",
"\n",
"def plot_results(pil_img, scores, boxes, labels, masks=None):\n",
" plt.figure(figsize=(16,10))\n",
" np_image = np.array(pil_img)\n",
" ax = plt.gca()\n",
" colors = COLORS * 100\n",
" if masks is None:\n",
" masks = [None for _ in range(len(scores))]\n",
" assert len(scores) == len(boxes) == len(labels) == len(masks)\n",
" for s, (xmin, ymin, xmax, ymax), l, mask, c in zip(scores, boxes.tolist(), labels, masks, colors):\n",
" ax.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,\n",
" fill=False, color=c, linewidth=3))\n",
" text = f'{l}: {s:0.2f}'\n",
" ax.text(xmin, ymin, text, fontsize=15, bbox=dict(facecolor='white', alpha=0.8))\n",
"\n",
" if mask is None:\n",
" continue\n",
" np_image = apply_mask(np_image, mask, c)\n",
"\n",
" padded_mask = np.zeros((mask.shape[0] + 2, mask.shape[1] + 2), dtype=np.uint8)\n",
" padded_mask[1:-1, 1:-1] = mask\n",
" contours = find_contours(padded_mask, 0.5)\n",
" for verts in contours:\n",
" # Subtract the padding and flip (y, x) to (x, y)\n",
" verts = np.fliplr(verts) - 1\n",
" p = Polygon(verts, facecolor=\"none\", edgecolor=c)\n",
" ax.add_patch(p)\n",
"\n",
"\n",
" plt.imshow(np_image)\n",
" plt.axis('off')\n",
" plt.savefig('foo.png',bbox_inches='tight')\n",
" return 'foo.png'\n",
"\n",
"\n",
"def add_res(results, ax, color='green'):\n",
" #for tt in results.values():\n",
" if True:\n",
" bboxes = results['boxes']\n",
" labels = results['labels']\n",
" scores = results['scores']\n",
" #keep = scores >= 0.0\n",
" #bboxes = bboxes[keep].tolist()\n",
" #labels = labels[keep].tolist()\n",
" #scores = scores[keep].tolist()\n",
" #print(torchvision.ops.box_iou(tt['boxes'].cpu().detach(), torch.as_tensor([[xmin, ymin, xmax, ymax]])))\n",
" \n",
" colors = ['purple', 'yellow', 'red', 'green', 'orange', 'pink']\n",
" \n",
" for i, (b, ll, ss) in enumerate(zip(bboxes, labels, scores)):\n",
" ax.add_patch(plt.Rectangle((b[0], b[1]), b[2] - b[0], b[3] - b[1], fill=False, color=colors[i], linewidth=3))\n",
" cls_name = ll if isinstance(ll,str) else CLASSES[ll]\n",
" text = f'{cls_name}: {ss:.2f}'\n",
" print(text)\n",
" ax.text(b[0], b[1], text, fontsize=15, bbox=dict(facecolor='white', alpha=0.8))\n",
"\n",
"\n",
"def plot_inference(im, caption, approaches):\n",
" \n",
" choices = {\"Worker Helmet Separately\" : 1,\"Worker Helmet Vest\":2, \"Workers only\":3}\n",
" \n",
" \n",
"# mean-std normalize the input image (batch-size: 1)\n",
" img = transform(im).unsqueeze(0).cpu()\n",
"\n",
" # propagate through the model\n",
" memory_cache = model2(img, [caption], encode_and_save=True)\n",
" outputs = model2(img, [caption], encode_and_save=False, memory_cache=memory_cache)\n",
"\n",
" # keep only predictions with 0.7+ confidence\n",
" probas = 1 - outputs['pred_logits'].softmax(-1)[0, :, -1].cpu()\n",
" keep = (probas > 0.7).cpu()\n",
"\n",
" # convert boxes from [0; 1] to image scales\n",
" bboxes_scaled = rescale_bboxes(outputs['pred_boxes'].cpu()[0, keep], im.size)\n",
"\n",
" # Extract the text spans predicted by each box\n",
" positive_tokens = (outputs[\"pred_logits\"].cpu()[0, keep].softmax(-1) > 0.1).nonzero().tolist()\n",
" predicted_spans = defaultdict(str)\n",
" for tok in positive_tokens:\n",
" item, pos = tok\n",
" if pos < 255:\n",
" span = memory_cache[\"tokenized\"].token_to_chars(0, pos)\n",
" predicted_spans [item] += \" \" + caption[span.start:span.end]\n",
"\n",
" labels = [predicted_spans [k] for k in sorted(list(predicted_spans .keys()))]\n",
" caption = 'Caption: '+ caption\n",
" return (sepia_call(caption, im, plot_results(im, probas[keep], bboxes_scaled, labels), choices[approaches]))\n",
" \n",
"\n",
"\n",
" \n",
"#BLIP Code\n",
"\n",
"\n",
"from modelsn.blip import blip_decoder\n",
"\n",
"image_size = 384\n",
"transform = transforms.Compose([\n",
" transforms.Resize((image_size,image_size),interpolation=InterpolationMode.BICUBIC),\n",
" transforms.ToTensor(),\n",
" transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))\n",
" ]) \n",
"\n",
"model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_base_caption.pth'\n",
" \n",
"model = blip_decoder(pretrained=model_url, image_size=384, vit='base')\n",
"model.eval()\n",
"model = model.to(device)\n",
"\n",
"\n",
"from modelsn.blip_vqa import blip_vqa\n",
"\n",
"image_size_vq = 480\n",
"transform_vq = transforms.Compose([\n",
" transforms.Resize((image_size_vq,image_size_vq),interpolation=InterpolationMode.BICUBIC),\n",
" transforms.ToTensor(),\n",
" transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))\n",
" ]) \n",
"\n",
"model_url_vq = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_vqa.pth'\n",
" \n",
"model_vq = blip_vqa(pretrained=model_url_vq, image_size=480, vit='base')\n",
"model_vq.eval()\n",
"model_vq = model_vq.to(device)\n",
"\n",
"\n",
"\n",
"def inference(raw_image, approaches, question):\n",
" \n",
"\n",
" image = transform(raw_image).unsqueeze(0).to(device) \n",
" with torch.no_grad():\n",
" caption = model.generate(image, sample=False, num_beams=3, max_length=20, min_length=5)\n",
"\n",
" return (plot_inference(raw_image, caption[0], approaches))\n",
" #return 'caption: '+caption[0]\n",
"\n",
" \n",
"\n",
" \n",
"#PPE Detection code\n",
"import numpy as np\n",
"import run_code\n",
"import gradio as gr\n",
" \n",
"\n",
"def sepia_call(caption, Input_Image, MDETR_im, Approach):\n",
" pil_image = Input_Image\n",
" open_cv_image = np.asarray(pil_image)\n",
" sepia_img = run_code.run(open_cv_image, Approach)\n",
" images = sepia_img['img']\n",
" texts= sepia_img['text']\n",
"\n",
" return (caption, MDETR_im, images, texts)\n",
"\n",
"\n",
"inputs = [gr.inputs.Image(type='pil'),gr.inputs.Radio(choices=[\"Worker Helmet Separately\",\"Worker Helmet Vest\", \"Workers only\"], type=\"value\", default=\"Worker Helmet Vest\", label=\"Model\"),\"textbox\"]\n",
"outputs = [gr.outputs.Textbox(label=\"Output\"), \"image\", \"image\", gr.outputs.Textbox(label=\"Output\")]\n",
"\n",
"\n",
"title = \"BLIP + MDETR + PPE Detection\"\n",
"\n",
"description = \"Gradio demo for BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation by Salesforce Research. To use it, simply upload your image, or click one of the examples to load them. Read more at the links below.\"\n",
"\n",
"article = \"BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation | Github Repo
\"\n",
"\n",
"\n",
"gr.Interface(inference, inputs, outputs, title=title, description=description, article=article, examples=[['starry.jpg',\"Image Captioning\",\"None\"]]).launch(share=True,enable_queue=True,cache_examples=False)"
]
},
{
"cell_type": "raw",
"id": "b2729aa9",
"metadata": {},
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}