pharmapsychotic's picture
CLIP Interrogator 2.0
e900656
raw
history blame
9.46 kB
import sys
sys.path.append('src/blip')
sys.path.append('src/clip')
import clip
import gradio as gr
import hashlib
import io
import IPython
import ipywidgets as widgets
import math
import numpy as np
import os
import pickle
import requests
import torch
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from models.blip import blip_decoder
from PIL import Image
from torch import nn
from torch.nn import functional as F
from tqdm import tqdm
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Loading BLIP model...")
blip_image_eval_size = 384
blip_model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth'
blip_model = blip_decoder(pretrained=blip_model_url, image_size=blip_image_eval_size, vit='large', med_config='./src/blip/configs/med_config.json')
blip_model.eval()
blip_model = blip_model.to(device)
print("Loading CLIP model...")
clip_model_name = 'ViT-L/14'
clip_model, clip_preprocess = clip.load(clip_model_name, device=device)
clip_model.to(device).eval()
chunk_size = 2048
flavor_intermediate_count = 2048
class LabelTable():
def __init__(self, labels, desc):
self.labels = labels
self.embeds = []
hash = hashlib.sha256(",".join(labels).encode()).hexdigest()
os.makedirs('./cache', exist_ok=True)
cache_filepath = f"./cache/{desc}.pkl"
if desc is not None and os.path.exists(cache_filepath):
with open(cache_filepath, 'rb') as f:
data = pickle.load(f)
if data['hash'] == hash:
self.labels = data['labels']
self.embeds = data['embeds']
if len(self.labels) != len(self.embeds):
self.embeds = []
chunks = np.array_split(self.labels, max(1, len(self.labels)/chunk_size))
for chunk in tqdm(chunks, desc=f"Preprocessing {desc}" if desc else None):
text_tokens = clip.tokenize(chunk).cuda()
with torch.no_grad():
text_features = clip_model.encode_text(text_tokens).float()
text_features /= text_features.norm(dim=-1, keepdim=True)
text_features = text_features.half().cpu().numpy()
for i in range(text_features.shape[0]):
self.embeds.append(text_features[i])
with open(cache_filepath, 'wb') as f:
pickle.dump({"labels":self.labels, "embeds":self.embeds, "hash":hash}, f)
def _rank(self, image_features, text_embeds, top_count=1):
top_count = min(top_count, len(text_embeds))
similarity = torch.zeros((1, len(text_embeds))).to(device)
text_embeds = torch.stack([torch.from_numpy(t) for t in text_embeds]).float().to(device)
for i in range(image_features.shape[0]):
similarity += (image_features[i].unsqueeze(0) @ text_embeds.T).softmax(dim=-1)
_, top_labels = similarity.cpu().topk(top_count, dim=-1)
return [top_labels[0][i].numpy() for i in range(top_count)]
def rank(self, image_features, top_count=1):
if len(self.labels) <= chunk_size:
tops = self._rank(image_features, self.embeds, top_count=top_count)
return [self.labels[i] for i in tops]
num_chunks = int(math.ceil(len(self.labels)/chunk_size))
keep_per_chunk = int(chunk_size / num_chunks)
top_labels, top_embeds = [], []
for chunk_idx in tqdm(range(num_chunks)):
start = chunk_idx*chunk_size
stop = min(start+chunk_size, len(self.embeds))
tops = self._rank(image_features, self.embeds[start:stop], top_count=keep_per_chunk)
top_labels.extend([self.labels[start+i] for i in tops])
top_embeds.extend([self.embeds[start+i] for i in tops])
tops = self._rank(image_features, top_embeds, top_count=top_count)
return [top_labels[i] for i in tops]
def generate_caption(pil_image):
gpu_image = T.Compose([
T.Resize((blip_image_eval_size, blip_image_eval_size), interpolation=TF.InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
])(pil_image).unsqueeze(0).to(device)
with torch.no_grad():
caption = blip_model.generate(gpu_image, sample=False, num_beams=3, max_length=20, min_length=5)
return caption[0]
def load_list(filename):
with open(filename, 'r', encoding='utf-8', errors='replace') as f:
items = [line.strip() for line in f.readlines()]
return items
def rank_top(image_features, text_array):
text_tokens = clip.tokenize([text for text in text_array]).cuda()
with torch.no_grad():
text_features = clip_model.encode_text(text_tokens).float()
text_features /= text_features.norm(dim=-1, keepdim=True)
similarity = torch.zeros((1, len(text_array)), device=device)
for i in range(image_features.shape[0]):
similarity += (image_features[i].unsqueeze(0) @ text_features.T).softmax(dim=-1)
_, top_labels = similarity.cpu().topk(1, dim=-1)
return text_array[top_labels[0][0].numpy()]
def similarity(image_features, text):
text_tokens = clip.tokenize([text]).cuda()
with torch.no_grad():
text_features = clip_model.encode_text(text_tokens).float()
text_features /= text_features.norm(dim=-1, keepdim=True)
similarity = text_features.cpu().numpy() @ image_features.cpu().numpy().T
return similarity[0][0]
def interrogate(image):
caption = generate_caption(image)
images = clip_preprocess(image).unsqueeze(0).cuda()
with torch.no_grad():
image_features = clip_model.encode_image(images).float()
image_features /= image_features.norm(dim=-1, keepdim=True)
flaves = flavors.rank(image_features, flavor_intermediate_count)
best_medium = mediums.rank(image_features, 1)[0]
best_artist = artists.rank(image_features, 1)[0]
best_trending = trendings.rank(image_features, 1)[0]
best_movement = movements.rank(image_features, 1)[0]
best_prompt = caption
best_sim = similarity(image_features, best_prompt)
def check(addition):
nonlocal best_prompt, best_sim
prompt = best_prompt + ", " + addition
sim = similarity(image_features, prompt)
if sim > best_sim:
best_sim = sim
best_prompt = prompt
return True
return False
def check_multi_batch(opts):
nonlocal best_prompt, best_sim
prompts = []
for i in range(2**len(opts)):
prompt = best_prompt
for bit in range(len(opts)):
if i & (1 << bit):
prompt += ", " + opts[bit]
prompts.append(prompt)
prompt = rank_top(image_features, prompts)
sim = similarity(image_features, prompt)
if sim > best_sim:
best_sim = sim
best_prompt = prompt
check_multi_batch([best_medium, best_artist, best_trending, best_movement])
extended_flavors = set(flaves)
for _ in tqdm(range(25), desc="Flavor chain"):
try:
best = rank_top(image_features, [f"{best_prompt}, {f}" for f in extended_flavors])
flave = best[len(best_prompt)+2:]
if not check(flave):
break
extended_flavors.remove(flave)
except:
# exceeded max prompt length
break
return best_prompt
sites = ['Artstation', 'behance', 'cg society', 'cgsociety', 'deviantart', 'dribble', 'flickr', 'instagram', 'pexels', 'pinterest', 'pixabay', 'pixiv', 'polycount', 'reddit', 'shutterstock', 'tumblr', 'unsplash', 'zbrush central']
trending_list = [site for site in sites]
trending_list.extend(["trending on "+site for site in sites])
trending_list.extend(["featured on "+site for site in sites])
trending_list.extend([site+" contest winner" for site in sites])
raw_artists = load_list('data/artists.txt')
artists = [f"by {a}" for a in raw_artists]
artists.extend([f"inspired by {a}" for a in raw_artists])
artists = LabelTable(artists, "artists")
flavors = LabelTable(load_list('data/flavors.txt'), "flavors")
mediums = LabelTable(load_list('data/mediums.txt'), "mediums")
movements = LabelTable(load_list('data/movements.txt'), "movements")
trendings = LabelTable(trending_list, "trendings")
def inference(image):
return interrogate(image)
inputs = [gr.inputs.Image(type='pil')]
outputs = gr.outputs.Textbox(label="Output")
title = "CLIP Interrogator"
description = "Want to figure out what a good prompt might be to create new images like an existing one? The CLIP Interrogator is here to get you answers!"
article = """
<p>
Example art by <a href="https://pixabay.com/illustrations/watercolour-painting-art-effect-4799014/">Layers</a>
and <a href="https://pixabay.com/illustrations/animal-painting-cat-feline-pet-7154059/">Lin Tong</a>
from pixabay.com
</p>
<p>
Has this been helpful to you? Follow me on twitter
<a href="https://twitter.com/pharmapsychotic">@pharmapsychotic</a>
and check out more tools at my
<a href="https://pharmapsychotic.com/tools.html">Ai generative art tools list</a>
</p>
"""
gr.Interface(
inference,
inputs,
outputs,
title=title, description=description,
article=article,
examples=[['example01.jpg'], ['example02.jpg']]
).launch(enable_queue=True)