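# Gradio app that detects faces with a Darknet/YOLOv3 model (Open Images weights)
# and classifies each detected face's emotion and sentiment with FastAI VGG19
# learners trained on FER2013.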
import torch
from torch.autograd import Variable
from torch.cuda import is_available as check_cuda

from utils import *
from darknet import Darknet

import gradio as gr
from numpy import array
from PIL.ImageOps import grayscale
from fastai.vision.all import PILImage, load_learner

# Detection settings
batch_size = 1
confidence = 0.25   # detection confidence threshold
nms_thresh = 0.30   # IoU threshold for non-maximum suppression
run_cuda = False    # set True to allow GPU inference when available

# YOLOv3 model trained on Open Images
cfg = 'cfg/yolov3-openimages.cfg'
clsnames = 'cfg/openimages.names'
weights = 'cfg/yolov3-openimages.weights'

# Class names for the Open Images detector
classes = load_classes(clsnames)
num_classes = len(classes)

print('Load Network')
model = Darknet(cfg)

print('Load Weights')
model.load_weights(weights)

print('Successfully loaded Network')

# Use CUDA only when explicitly requested and actually available
if run_cuda:
    CUDA = check_cuda()
else:
    CUDA = False

# Network input resolution (square), read from the cfg file
inp_dim = int(model.net_info["height"])

if CUDA:
    model.cuda()

# Inference only
model.eval()

def get_detections(x):
    '''Convert one detection row from write_results into (label, (x1, y1, x2, y2)).

    Each row is [image index, x1, y1, x2, y2, ..., class index].'''
    c1 = [int(y) for y in x[1:3]]   # top-left corner
    c2 = [int(y) for y in x[3:5]]   # bottom-right corner

    det_class = int(x[-1])
    label = "{0}".format(classes[det_class])

    return (label, tuple(c1 + c2))

def detector(image):
    '''Run the YOLOv3 detector on a single image (numpy array, RGB).

    Returns (image, detections), where detections is a list of
    (label, (x1, y1, x2, y2)) tuples, or None if nothing was detected.'''
    imlist = [image]
    loaded_ims = [image]

    # Resize/letterbox the image to the network input size
    im_batches = list(map(prep_image, loaded_ims, [inp_dim for x in range(len(imlist))]))
    im_dim_list = [(x.shape[1], x.shape[0]) for x in loaded_ims]
    im_dim_list = torch.FloatTensor(im_dim_list).repeat(1, 2)

    leftover = 0
    if len(im_dim_list) % batch_size:
        leftover = 1

    if batch_size != 1:
        num_batches = len(imlist) // batch_size + leftover
        im_batches = [torch.cat(im_batches[i*batch_size: min((i + 1)*batch_size, len(im_batches))])
                      for i in range(num_batches)]

    write = 0
    if CUDA:
        im_dim_list = im_dim_list.cuda()

    for i, batch in enumerate(im_batches):
        if CUDA:
            batch = batch.cuda()
        with torch.no_grad():
            prediction = model(Variable(batch), CUDA)

        # Confidence thresholding and non-maximum suppression
        prediction = write_results(prediction, confidence, num_classes, nms_conf=nms_thresh)

        # write_results returns an int when the batch has no detections
        if type(prediction) == int:
            continue

        # Convert batch-local image indices to indices into imlist
        prediction[:, 0] += i*batch_size

        if not write:
            output = prediction
            write = 1
        else:
            output = torch.cat((output, prediction))

        for im_num, image in enumerate(imlist[i*batch_size: min((i + 1)*batch_size, len(imlist))]):
            im_id = i * batch_size + im_num
            # Detected class names for this image (not used further)
            objs = [classes[int(x[-1])] for x in output if int(x[0]) == im_id]

        if CUDA:
            torch.cuda.synchronize()

    # No detections in any batch
    try:
        output
    except NameError:
        return None

    # Rescale box coordinates from the padded network input back to the original image
    im_dim_list = torch.index_select(im_dim_list, 0, output[:, 0].long())

    scaling_factor = torch.min(inp_dim/im_dim_list, 1)[0].view(-1, 1)

    output[:, [1, 3]] -= (inp_dim - scaling_factor*im_dim_list[:, 0].view(-1, 1))/2
    output[:, [2, 4]] -= (inp_dim - scaling_factor*im_dim_list[:, 1].view(-1, 1))/2

    output[:, 1:5] /= scaling_factor

    # Clamp boxes to the image boundaries
    for i in range(output.shape[0]):
        output[i, [1, 3]] = torch.clamp(output[i, [1, 3]], 0.0, im_dim_list[i, 0])
        output[i, [2, 4]] = torch.clamp(output[i, [2, 4]], 0.0, im_dim_list[i, 1])

    detections = list(map(get_detections, output))

    if CUDA:
        torch.cuda.empty_cache()

    return loaded_ims[0], detections

# FastAI learners: one for emotion (7 classes), one for sentiment (3 classes)
learn_emotion = load_learner('models/emotions_vgg19.pkl')
learn_emotion_labels = learn_emotion.dls.vocab

learn_sentiment = load_learner('models/sentiment_vgg19.pkl')
learn_sentiment_labels = learn_sentiment.dls.vocab

def crop_images(img, bbox):
    "Crop a PILImage (from PILImage.create) to the (xmin, ymin, xmax, ymax) box of a detection tuple."

    xmin, ymin, xmax, ymax = bbox[1]

    return img.crop((xmin, ymin, xmax, ymax))

def detect_person_face(img, detections):
    '''Called from within detect_face. If only a person (rather than a face)
    was detected, crop the image to each person and try to detect a face again.'''

    faces = []

    for detection in detections:
        # Crop to the detected person
        temp = crop_images(img, detection)

        # Run the detector on the crop (dropping any alpha channel)
        result = detector(array(temp)[..., :3])
        if result is None:
            continue
        _, detect = result

        human_face = [idx for idx, val in enumerate(detect) if val[0] == 'Human face']

        if len(human_face) == 0:
            continue

        # Keep the first face found in this person crop
        faces.append(crop_images(temp, detect[human_face[0]]))

    return faces

def detect_face(img):
    '''Detect faces in the image. If no face is found, fall back to person
    detections and look for faces inside each person crop. Returns a list of
    cropped face images, or None when neither a face nor a person is found.'''

    result = detector(array(img)[..., :3])
    if result is None:
        return None
    _, detections = result

    human_face = [idx for idx, val in enumerate(detections) if val[0] == 'Human face']

    if len(human_face) == 0:
        # No face detected directly; fall back to 'Person' detections
        human_face = [idx for idx, val in enumerate(detections) if val[0] == 'Person']

        if len(human_face) == 0:
            return None
        else:
            faces = detect_person_face(img, [detections[idx] for idx in human_face])

    else:
        faces = []
        for idx in human_face:
            faces.append(crop_images(img, detections[idx]))

    return faces

def predict(img):
    '''Gradio prediction function. Returns 9 outputs: (face image, emotion
    probabilities, sentiment probabilities) for up to three detected faces.'''

    img = PILImage.create(img)

    faces = detect_face(img)

    output = []

    if faces is None or len(faces) == 0:
        # No usable face: score the whole image so the interface still gets an output
        img = img.resize((48, 48))

        pred_emotion, pred_emotion_idx, probs_emotion = learn_emotion.predict(array(grayscale(img)))
        pred_sentiment, pred_sentiment_idx, probs_sentiment = learn_sentiment.predict(array(grayscale(img)))

        emotions = {learn_emotion_labels[i]: float(probs_emotion[i]) for i in range(len(learn_emotion_labels))}
        sentiments = {learn_sentiment_labels[i]: float(probs_sentiment[i]) for i in range(len(learn_sentiment_labels))}

        output = [img, emotions, sentiments, None, None, None, None, None, None]

    else:
        # Score at most three faces
        for face in faces[:3]:
            img = face.resize((48, 48))

            pred_emotion, pred_emotion_idx, probs_emotion = learn_emotion.predict(array(grayscale(img)))
            pred_sentiment, pred_sentiment_idx, probs_sentiment = learn_sentiment.predict(array(grayscale(img)))

            emotions = {learn_emotion_labels[i]: float(probs_emotion[i]) for i in range(len(learn_emotion_labels))}
            sentiments = {learn_sentiment_labels[i]: float(probs_sentiment[i]) for i in range(len(learn_sentiment_labels))}

            output.append(img)
            output.append(emotions)
            output.append(sentiments)

    # Pad to 9 outputs (3 faces x 3 fields) by repeating the last results
    temp = output[-3:]
    while len(output) < 9:
        output = output + temp

    return output

title = 'Face Recognition with Emotion and Sentiment Detector'

description = gr.Markdown(
    """Ever wondered what a person might be feeling looking at their picture?
Well, now you can! Try this fun app. Just upload a facial image in JPG or
PNG format. Voila! You can now see what they might have felt when the picture
was taken.

This is an updated version of Facial Expression Classifier:
https://huggingface.co/spaces/schibsted/facial_expression_classifier
""").value

article = gr.Markdown(
    """**DISCLAIMER:** This model does not reveal the actual emotional state of a person. Use and
interpret results at your own risk! It was built as a demo for an AI course. Sample images
were downloaded from VG & Aftenposten news webpages. Copyrights belong to the respective
brands. All rights reserved.

**PREMISE:** The idea is to determine an overall sentiment of a news site on a daily basis
based on its pictures. We restrict the pictures to close-up facial images.

**DATA:** The FER2013 dataset consists of 48x48 pixel grayscale images of faces. There are 28,709
images in the training set and 3,589 images in the test set. However, for this demo all
pictures were combined into a single dataset and an 80:20 split was used for training. Each image
is assigned one of 7 emotions: Angry, Disgust, Fear, Happy, Sad, Surprise, and Neutral.
In addition to these 7 classes, images were re-classified into 3 sentiment categories based
on the emotions:

Positive (Happy, Surprise)

Negative (Angry, Disgust, Fear, Sad)

Neutral (Neutral)

The FER2013 (preliminary version) dataset can be downloaded at:
https://www.kaggle.com/c/challenges-in-representation-learning-facial-expression-recognition-challenge/data

**EMOTION / SENTIMENT MODEL:** VGG19 was used as the base model and trained on the FER2013 dataset with
PyTorch and FastAI. Two models were trained, one for detecting emotion and the other for detecting
sentiment. Although this could have been done with a single model, two were trained for this demo.

**FACE DETECTOR:** Darknet with the YOLOv3 architecture was used for face detection. Reach out to me for full details.
In short, any image is first sent through Darknet. If a face is detected, it is passed through the emotion/sentiment
models; this is done for each face in the picture. If a person is detected rather than a face, the image is cropped and run through
the face detector again; if a face is then found, it is passed through the emotion/sentiment models. If no face is
detected in an image, the entire image is evaluated to generate some score. This is done because I couldn't
figure out how to pipe None/blank output to Gradio.Interface(). There may be an option through Gradio.Blocks(), but I was
too lazy to go through that at this stage. In addition, the output is restricted to at most 3 faces per picture.
""").value

enable_queue = True

examples = ['happy1.jpg', 'happy2.jpg', 'angry1.png', 'angry2.jpg', 'neutral1.jpg', 'neutral2.jpg']

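# Build the Gradio interface: one image input and three (image, emotion, sentiment)
# output triplets, one per detected face.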
gr.Interface(fn=predict,
             inputs=gr.Image(),
             outputs=[gr.Image(shape=(12, 12), label='Person 1'),
                      gr.Label(label='Emotion - Person 1'),
                      gr.Label(label='Sentiment - Person 1'),
                      gr.Image(shape=(12, 12), label='Person 2'),
                      gr.Label(label='Emotion - Person 2'),
                      gr.Label(label='Sentiment - Person 2'),
                      gr.Image(shape=(12, 12), label='Person 3'),
                      gr.Label(label='Emotion - Person 3'),
                      gr.Label(label='Sentiment - Person 3')],
             title=title,
             examples=examples,
             description=description,
             article=article,
             allow_flagging='never').launch(enable_queue=enable_queue)