# Spaces status: Runtime error
import os | |
# Fetch the example image referenced by the Gradio `examples` list at the
# bottom of this file. The download stays best-effort (the app still starts
# without it), but a failure is now reported instead of silently ignored.
import subprocess

try:
    _wget_result = subprocess.run(
        [
            "wget",
            "https://upload.wikimedia.org/wikipedia/commons/thumb/e/ea/Van_Gogh_-_Starry_Night_-_Google_Art_Project.jpg/1920px-Van_Gogh_-_Starry_Night_-_Google_Art_Project.jpg",
            "-O",
            "starry.jpg",
        ],
        check=False,
    )
    _wget_failed = _wget_result.returncode != 0
except OSError:
    # Raised when the wget executable itself cannot be started.
    _wget_failed = True
if _wget_failed:
    print("warning: could not download starry.jpg; the bundled example image will be missing")
from PIL import Image | |
import requests | |
import torch | |
from torchvision import transforms | |
from torchvision.transforms.functional import InterpolationMode | |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') | |
# MDETR Code | |
import torchvision.transforms as T | |
import matplotlib.pyplot as plt | |
from collections import defaultdict | |
import torch.nn.functional as F | |
import numpy as np | |
from skimage.measure import find_contours | |
from matplotlib import patches, lines | |
from matplotlib.patches import Polygon | |
import gradio as gr | |
# Download a sample image, then load the pretrained MDETR (EfficientNet-B5)
# model together with its post-processor from torch.hub.
torch.hub.download_url_to_file(
    'https://cdn.pixabay.com/photo/2014/03/04/15/10/elephants-279505_1280.jpg',
    'elephant.jpg',
)
model2, postprocessor = torch.hub.load(
    'ashkamath/mdetr:main',
    'mdetr_efficientnetB5',
    pretrained=True,
    return_postprocessor=True,
)
# Keep MDETR on the CPU and in evaluation mode.
model2 = model2.cpu().eval()
# Inference only: switch autograd off for the whole process.
torch.set_grad_enabled(False)
# Input preprocessing for MDETR: resize to 800, convert to a tensor, then
# apply mean/std normalisation.
# NOTE(review): the name `transform` is assigned again in the BLIP section
# further down this file.
transform = T.Compose([
    T.Resize(800),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
# for output bounding box post-processing
def box_cxcywh_to_xyxy(x):
    """Convert boxes from (center_x, center_y, width, height) to corner form.

    Args:
        x: Tensor whose dimension 1 holds the four box components.

    Returns:
        Tensor of the same layout holding (xmin, ymin, xmax, ymax).
    """
    center_x, center_y, width, height = x.unbind(1)
    half_w = 0.5 * width
    half_h = 0.5 * height
    corners = (
        center_x - half_w,
        center_y - half_h,
        center_x + half_w,
        center_y + half_h,
    )
    return torch.stack(corners, dim=1)
def rescale_bboxes(out_bbox, size):
    """Convert center-format boxes to corner format and scale them by `size`.

    Args:
        out_bbox: Boxes in (center_x, center_y, width, height) form.
        size: (width, height) pair used as the per-axis scale factors.

    Returns:
        Boxes in (xmin, ymin, xmax, ymax) form multiplied by the image size.
    """
    width, height = size
    scale = torch.tensor([width, height, width, height], dtype=torch.float32)
    return box_cxcywh_to_xyxy(out_bbox) * scale
# RGB triples (components in the 0..1 range) cycled through when drawing.
COLORS = [
    [0.000, 0.447, 0.741],
    [0.850, 0.325, 0.098],
    [0.929, 0.694, 0.125],
    [0.494, 0.184, 0.556],
    [0.466, 0.674, 0.188],
    [0.301, 0.745, 0.933],
]
def apply_mask(image, mask, color, alpha=0.5):
    """Blend `color` into `image` wherever `mask` equals 1.

    The image is modified in place, one channel at a time, and also returned.

    Args:
        image: Array indexed as [row, column, channel] with at least 3 channels.
        mask: Array broadcastable against one image channel; cells equal to 1
            are tinted.
        color: Sequence of three components, each scaled by 255 when blended.
        alpha: Weight of the colour in the blend.

    Returns:
        The same `image` object that was passed in.
    """
    selected = mask == 1
    for channel in range(3):
        plane = image[:, :, channel]
        tinted = plane * (1 - alpha) + alpha * color[channel] * 255
        image[:, :, channel] = np.where(selected, tinted, plane)
    return image
def plot_results(pil_img, scores, boxes, labels, masks=None):
    """Draw boxes, labels and optional masks over an image and save the figure.

    Args:
        pil_img: Image convertible with `np.array`.
        scores: One score per detection, formatted with two decimals.
        boxes: Object with `.tolist()` yielding (xmin, ymin, xmax, ymax) rows.
        labels: One text label per detection.
        masks: Optional per-detection 2-D masks; `None` entries are skipped.

    Returns:
        The path of the written image, always 'foo.png'.

    Raises:
        AssertionError: If scores, boxes, labels and masks differ in length.
    """
    if masks is None:
        masks = [None for _ in range(len(scores))]
    # Validate before a figure is created so a bad call leaves nothing open.
    assert len(scores) == len(boxes) == len(labels) == len(masks)

    fig = plt.figure(figsize=(16, 10))
    try:
        np_image = np.array(pil_img)
        ax = plt.gca()
        colors = COLORS * 100
        for s, (xmin, ymin, xmax, ymax), l, mask, c in zip(scores, boxes.tolist(), labels, masks, colors):
            ax.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                       fill=False, color=c, linewidth=3))
            text = f'{l}: {s:0.2f}'
            ax.text(xmin, ymin, text, fontsize=15, bbox=dict(facecolor='white', alpha=0.8))

            if mask is None:
                continue
            np_image = apply_mask(np_image, mask, c)

            # Pad by one pixel on every side so contours touching the image
            # border are still closed.
            padded_mask = np.zeros((mask.shape[0] + 2, mask.shape[1] + 2), dtype=np.uint8)
            padded_mask[1:-1, 1:-1] = mask
            contours = find_contours(padded_mask, 0.5)
            for verts in contours:
                # Subtract the padding and flip (y, x) to (x, y)
                verts = np.fliplr(verts) - 1
                p = Polygon(verts, facecolor="none", edgecolor=c)
                ax.add_patch(p)

        plt.imshow(np_image)
        plt.axis('off')
        plt.savefig('foo.png', bbox_inches='tight')
    finally:
        # pyplot keeps every figure alive until it is closed; without this,
        # each request would leak one figure.
        plt.close(fig)
    return 'foo.png'
def add_res(results, ax, color='green'):
    """Draw each detection in `results` as a labelled rectangle on `ax`.

    Args:
        results: Mapping with 'boxes', 'labels' and 'scores' entries that are
            iterated in lockstep; each box is indexed as
            (xmin, ymin, xmax, ymax).
        ax: Drawing target providing `add_patch` and `text`.
        color: Not used by this function; kept so existing callers still work.
    """
    bboxes = results['boxes']
    labels = results['labels']
    scores = results['scores']

    colors = ['purple', 'yellow', 'red', 'green', 'orange', 'pink']

    for i, (b, ll, ss) in enumerate(zip(bboxes, labels, scores)):
        # Wrap around the palette so more than six boxes do not raise IndexError.
        edge_color = colors[i % len(colors)]
        ax.add_patch(plt.Rectangle((b[0], b[1]), b[2] - b[0], b[3] - b[1],
                                   fill=False, color=edge_color, linewidth=3))
        # NOTE(review): CLASSES is not defined in the visible part of this
        # file; non-string labels need it to be provided elsewhere.
        cls_name = ll if isinstance(ll, str) else CLASSES[ll]
        text = f'{cls_name}: {ss:.2f}'
        print(text)
        ax.text(b[0], b[1], text, fontsize=15, bbox=dict(facecolor='white', alpha=0.8))
# Preprocessing for MDETR, kept under its own name: the module-level
# `transform` is rebound to the BLIP preprocessing (384x384, different
# mean/std) later in this file, so looking it up at request time would feed
# MDETR the wrong input.
_MDETR_TRANSFORM = T.Compose([
    T.Resize(800),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])


def plot_inference(im, caption, approaches):
    """Ground `caption` in `im` with MDETR, plot the boxes, then run PPE detection.

    Args:
        im: PIL image; its `size` is used to scale the predicted boxes.
        caption: Text whose spans are matched to the predicted boxes.
        approaches: One of "Worker Helmet Separately", "Worker Helmet Vest"
            or "Workers only".

    Returns:
        Whatever `sepia_call` returns for the prefixed caption, the input
        image, the saved MDETR plot and the numeric approach id.

    Raises:
        KeyError: If `approaches` is not one of the known names.
    """
    choices = {"Worker Helmet Separately": 1, "Worker Helmet Vest": 2, "Workers only": 3}
    # Resolve the approach first so an unknown name fails before the model runs.
    approach_id = choices[approaches]

    # mean-std normalize the input image (batch-size: 1)
    img = _MDETR_TRANSFORM(im).unsqueeze(0).cpu()

    # propagate through the model
    memory_cache = model2(img, [caption], encode_and_save=True)
    outputs = model2(img, [caption], encode_and_save=False, memory_cache=memory_cache)

    # keep only predictions with 0.7+ confidence; the confidence is one minus
    # the probability of the last logit
    probas = 1 - outputs['pred_logits'].softmax(-1)[0, :, -1].cpu()
    keep = (probas > 0.7).cpu()

    # convert boxes from [0; 1] to image scales
    bboxes_scaled = rescale_bboxes(outputs['pred_boxes'].cpu()[0, keep], im.size)

    # Extract the text spans predicted by each box
    positive_tokens = (outputs["pred_logits"].cpu()[0, keep].softmax(-1) > 0.1).nonzero().tolist()
    predicted_spans = defaultdict(str)
    for item, pos in positive_tokens:
        # Positions from 255 upwards are not mapped back to caption characters.
        if pos < 255:
            span = memory_cache["tokenized"].token_to_chars(0, pos)
            predicted_spans[item] += " " + caption[span.start:span.end]

    # One label per kept box, in box order. A box with no token above the
    # threshold gets an empty label rather than being dropped, which would
    # leave labels shorter than boxes and fail the length check in plot_results.
    labels = [predicted_spans[k] for k in range(len(bboxes_scaled))]

    caption = 'Caption: ' + caption
    return sepia_call(caption, im, plot_results(im, probas[keep], bboxes_scaled, labels), approach_id)
# BLIP Code | |
from modelsn.blip import blip_decoder | |
# BLIP captioning: preprocessing pipeline and pretrained decoder.
image_size = 384
transform = transforms.Compose([
    transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize(
        (0.48145466, 0.4578275, 0.40821073),
        (0.26862954, 0.26130258, 0.27577711),
    ),
])

model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_base_caption.pth'
model = blip_decoder(pretrained=model_url, image_size=image_size, vit='base')
# Evaluation mode first, then move to the selected device.
model.eval()
model = model.to(device)
from modelsn.blip_vqa import blip_vqa | |
# BLIP visual question answering: preprocessing pipeline and pretrained model.
image_size_vq = 480
transform_vq = transforms.Compose([
    transforms.Resize((image_size_vq, image_size_vq), interpolation=InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize(
        (0.48145466, 0.4578275, 0.40821073),
        (0.26862954, 0.26130258, 0.27577711),
    ),
])

model_url_vq = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_vqa.pth'
model_vq = blip_vqa(pretrained=model_url_vq, image_size=image_size_vq, vit='base')
# Evaluation mode first, then move to the selected device.
model_vq.eval()
model_vq = model_vq.to(device)
def inference(raw_image, approaches, question):
    """Caption `raw_image` with BLIP and hand the caption to `plot_inference`.

    Args:
        raw_image: Image accepted by the module-level `transform`.
        approaches: Approach name forwarded unchanged to `plot_inference`.
        question: Accepted for the interface's third input; not used here.

    Returns:
        The result of `plot_inference` for the first generated caption.
    """
    batch = transform(raw_image).unsqueeze(0).to(device)
    with torch.no_grad():
        captions = model.generate(batch, sample=False, num_beams=3, max_length=20, min_length=5)
        first_caption = captions[0]
        return plot_inference(raw_image, first_caption, approaches)
# PPE Detection code | |
import numpy as np | |
import run_code | |
import gradio as gr | |
def sepia_call(caption, Input_Image, MDETR_im, Approach):
    """Run the PPE detector on the input image and bundle all demo outputs.

    Args:
        caption: Caption text passed through unchanged.
        Input_Image: Image converted with `np.asarray` before detection.
        MDETR_im: MDETR plot passed through unchanged.
        Approach: Value forwarded to `run_code.run` as its second argument.

    Returns:
        Tuple of (caption, MDETR_im, detector 'img' entry, detector 'text' entry).
    """
    frame = np.asarray(Input_Image)
    detection = run_code.run(frame, Approach)
    annotated = detection['img']
    summary = detection['text']
    return caption, MDETR_im, annotated, summary
# Gradio UI: image + approach selector + free-text box in; caption, MDETR
# plot, PPE image and PPE text out.
inputs = [
    gr.inputs.Image(type='pil'),
    gr.inputs.Radio(choices=["Worker Helmet Separately", "Worker Helmet Vest", "Workers only"], type="value",
                    default="Worker Helmet Vest", label="Model"),
    "textbox",
]
outputs = [gr.outputs.Textbox(label="Output"), "image", "image", gr.outputs.Textbox(label="Output")]

title = "BLIP + MDETR + PPE Detection"
description = "Gradio demo for BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation by Salesforce Research. To use it, simply upload your image, or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2201.12086' target='_blank'>BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation</a> | <a href='https://github.com/salesforce/BLIP' target='_blank'>Github Repo</a></p>"

# The example row's second value must be one of the Radio choices above:
# plot_inference looks it up in a dict keyed by those names, so any other
# value raises KeyError when the example is run.
gr.Interface(inference, inputs, outputs, title=title, description=description, article=article,
             examples=[['starry.jpg', "Worker Helmet Vest", "None"]]).launch(share=True, enable_queue=True,
                                                                             cache_examples=False)