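"""Streamlit demo: semantic image-text search with FLAVA and CLIP.

Compares FAIR's FLAVA model against several CLIP ViT variants over
precomputed image embeddings for two corpora (Unsplash and TMDB).
"""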
import re

import numpy as np
import pandas as pd
import streamlit as st
import torch
from st_clickable_images import clickable_images
from transformers import CLIPModel, CLIPProcessor, FlavaModel, FlavaProcessor
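
# Checkpoint suffixes: "flava-full" resolves to facebook/flava-full, the
# others to openai/clip-<name> (see load() below).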
MODEL_NAMES = ["flava-full", "vit-base-patch32", "vit-base-patch16", "vit-large-patch14", "vit-large-patch14-336"]
@st.cache(allow_output_mutation=True)
def load():
    df = {0: pd.read_csv("data.csv"), 1: pd.read_csv("data2.csv")}
    models = {}
    processors = {}
    embeddings = {}
    for name in MODEL_NAMES:
        if "flava" not in name:
            model = CLIPModel
            processor = CLIPProcessor
            prefix = "openai/clip-"
        else:
            model = FlavaModel
            processor = FlavaProcessor
            prefix = "facebook/"
        models[name] = model.from_pretrained(f"{prefix}{name}")
        models[name].eval()
        processors[name] = processor.from_pretrained(f"{prefix}{name}")
        embeddings[name] = {
            0: np.load(f"embeddings-{name}.npy"),
            1: np.load(f"embeddings2-{name}.npy"),
        }
        for k in [0, 1]:
            embeddings[name][k] = embeddings[name][k] / np.linalg.norm(
                embeddings[name][k], axis=1, keepdims=True
            )
    return models, processors, df, embeddings
models, processors, df, embeddings = load()
source = {0: "\nSource: Unsplash", 1: "\nSource: The Movie Database (TMDB)"}
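
# Embed a batch of strings with the selected model and L2-normalize, so that
# cosine similarity against the image embeddings reduces to a dot product.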
def compute_text_embeddings(list_of_strings, name):
    inputs = processors[name](text=list_of_strings, return_tensors="pt", padding=True)
    with torch.no_grad():
        result = models[name].get_text_features(**inputs)
    if "flava" in name:
        # FLAVA returns per-token hidden states; keep the [CLS] embedding.
        result = result[:, 0, :]
    result = result.detach().numpy()
    return result / np.linalg.norm(result, axis=1, keepdims=True)
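
# Rank images against a query that may combine several positive sub-queries
# (";"-separated text or "[Corpus:idx]" image references) and negative
# sub-queries after "EXCLUDING". Example (the index 123 is hypothetical):
#   image_search("[Unsplash:123] red flowers EXCLUDING green", "Unsplash", "flava-full")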
def image_search(query, corpus, name, n_results=24):
    positive_embeddings = None

    def concatenate_embeddings(e1, e2):
        if e1 is None:
            return e2
        return np.concatenate((e1, e2), axis=0)

    split_query = query.split("EXCLUDING ")
    dot_product = 0
    k = 0 if corpus == "Unsplash" else 1
    if len(split_query[0]) > 0:
        positive_queries = split_query[0].split(";")
        for positive_query in positive_queries:
            # "[Corpus:idx]" pulls the stored embedding of image idx as a query.
            match = re.match(r"\[(Movies|Unsplash):(\d{1,5})\](.*)", positive_query)
            if match:
                corpus2, idx, remainder = match.groups()
                idx, remainder = int(idx), remainder.strip()
                k2 = 0 if corpus2 == "Unsplash" else 1
                positive_embeddings = concatenate_embeddings(
                    positive_embeddings, embeddings[name][k2][idx : idx + 1, :]
                )
                if len(remainder) > 0:
                    positive_embeddings = concatenate_embeddings(
                        positive_embeddings, compute_text_embeddings([remainder], name)
                    )
            else:
                positive_embeddings = concatenate_embeddings(
                    positive_embeddings, compute_text_embeddings([positive_query], name)
                )
        # Normalize each query's scores (shift by median, scale by max), then
        # rank images by their worst score across the positive queries.
        dot_product = embeddings[name][k] @ positive_embeddings.T
        dot_product = dot_product - np.median(dot_product, axis=0)
        dot_product = dot_product / np.max(dot_product, axis=0, keepdims=True)
        dot_product = np.min(dot_product, axis=1)
    if len(split_query) > 1:
        negative_queries = (" ".join(split_query[1:])).split(";")
        negative_embeddings = compute_text_embeddings(negative_queries, name)
        # Penalize images that score high on any negative ("EXCLUDING") query.
        dot_product2 = embeddings[name][k] @ negative_embeddings.T
        dot_product2 = dot_product2 - np.median(dot_product2, axis=0)
        dot_product2 = dot_product2 / np.max(dot_product2, axis=0, keepdims=True)
        dot_product -= np.max(np.maximum(dot_product2, 0), axis=1)
    results = np.argsort(dot_product)[-1 : -n_results - 1 : -1]
    return [
        (
            df[k].iloc[i]["path"],
            df[k].iloc[i]["tooltip"] + source[k],
            i,
        )
        for i in results
    ]
description = """
# FLAVA Semantic Image-Text Search
"""
instruction = """
### **Enter your query and hit enter**
**Things to try:** compare with other models or search for "a field in countryside EXCLUDING green"
"""
credit = """
*Built with FAIR's [FLAVA](https://arxiv.org/abs/2112.04482) model, πŸ€— Hugging Face's [transformers library](https://huggingface.co/transformers/), [Streamlit](https://streamlit.io/), 25k images from [Unsplash](https://unsplash.com/) and 8k images from [The Movie Database (TMDB)](https://www.themoviedb.org/)*
*Forked from and inspired by a similar app available [here](https://huggingface.co/spaces/vivien/clip/)*
"""
options = """
## Compare
Check results for a single model or compare two models by using the dropdown below:
"""
howto = """
## Advanced Use
- Click on an image to use it as a query and find similar images.
- Several queries, including one based on an image, can be combined (use "**;**" as a separator).
  - Try "a person walking on a grass field; red flowers".
- If the input includes "**EXCLUDING**", text following it will be used as a negative query.
  - Try "a field in the countryside which is green" and "a field in countryside EXCLUDING green".
"""
div_style = {
    "display": "flex",
    "justify-content": "center",
    "flex-wrap": "wrap",
}
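
# Page layout: the sidebar holds the mode selector and docs; the main column
# holds the query box and result grid(s). Clicking a result stores an
# "[Corpus:idx]" query in session state and reruns the app.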
def main():
    st.markdown(
        """
        <style>
        .block-container{
            max-width: 1200px;
        }
        div.row-widget.stRadio > div{
            flex-direction: row;
            display: flex;
            justify-content: center;
        }
        div.row-widget.stRadio > div > label{
            margin-left: 5px;
            margin-right: 5px;
        }
        .row-widget {
            margin-top: -25px;
        }
        section>div:first-child {
            padding-top: 30px;
        }
        div.reportview-container > section:first-child{
            max-width: 320px;
        }
        #MainMenu {
            visibility: hidden;
        }
        footer {
            visibility: hidden;
        }
        </style>""",
        unsafe_allow_html=True,
    )
    st.sidebar.markdown(description)
    st.sidebar.markdown(options)
    mode = st.sidebar.selectbox(
        "", ["Results for FLAVA full", "Comparison of 2 models"], index=0
    )
    st.sidebar.markdown(howto)
    st.sidebar.markdown(credit)
    _, c, _ = st.columns((1, 3, 1))
    c.markdown(instruction)
    if "query" in st.session_state:
        query = c.text_input("", value=st.session_state["query"])
    else:
        query = c.text_input("", value="a field in the countryside which is green")
    corpus = st.radio("", ["Unsplash", "Movies"])
    models_dict = {
        "FLAVA": "flava-full",
        "ViT-B/32 (quickest)": "vit-base-patch32",
        "ViT-B/16 (quick)": "vit-base-patch16",
        "ViT-L/14 (slow)": "vit-large-patch14",
        "ViT-L/14@336px (slowest)": "vit-large-patch14-336",
    }
    if "Comparison" in mode:
        c1, c2 = st.columns((1, 1))
        selection1 = c1.selectbox("", models_dict.keys(), index=0)
        selection2 = c2.selectbox("", models_dict.keys(), index=3)
        name1 = models_dict[selection1]
        name2 = models_dict[selection2]
    else:
        name1 = MODEL_NAMES[0]
    if len(query) > 0:
        results1 = image_search(query, corpus, name1)
        if "Comparison" in mode:
            with c1:
                clicked1 = clickable_images(
                    [result[0] for result in results1],
                    titles=[result[1] for result in results1],
                    div_style=div_style,
                    img_style={"margin": "2px", "height": "150px"},
                    key=query + corpus + name1 + "1",
                )
            results2 = image_search(query, corpus, name2)
            with c2:
                clicked2 = clickable_images(
                    [result[0] for result in results2],
                    titles=[result[1] for result in results2],
                    div_style=div_style,
                    img_style={"margin": "2px", "height": "150px"},
                    key=query + corpus + name2 + "2",
                )
        else:
            clicked1 = clickable_images(
                [result[0] for result in results1],
                titles=[result[1] for result in results1],
                div_style=div_style,
                img_style={"margin": "2px", "height": "200px"},
                key=query + corpus + name1 + "1",
            )
            clicked2 = -1
        if clicked2 >= 0 or clicked1 >= 0:
            change_query = False
            if "last_clicked" not in st.session_state:
                change_query = True
            elif max(clicked2, clicked1) != st.session_state["last_clicked"]:
                change_query = True
            if change_query:
                if clicked1 >= 0:
                    st.session_state["query"] = f"[{corpus}:{results1[clicked1][2]}]"
                elif clicked2 >= 0:
                    st.session_state["query"] = f"[{corpus}:{results2[clicked2][2]}]"
                st.experimental_rerun()
if __name__ == "__main__":
    main()