import face_recognition import requests import pandas as pd from io import BytesIO from tqdm import tqdm from time import time def get_image(url: str): headers = {"User-Agent": "Actors matching app 1.0"} response = requests.get(url, headers=headers) response.raise_for_status() img_file_object = BytesIO(response.content) return face_recognition.load_image_file(img_file_object) def get_embeddings(url: str): try: image = get_image(url) embeddings = face_recognition.face_encodings( image, num_jitters=2, model="large" ) return list(embeddings[0]) except Exception as e: print(e) def process_all_images(input_file, output_file): df = pd.read_csv(input_file)[["nconst", "contentUrl", "resultPosition"]] try: df_emb = pd.read_csv(output_file) df = df[~df["contentUrl"].isin(df_emb["contentUrl"])] except: # file does not exists yet df_emb = pd.DataFrame(columns=list(df.columns) + ["embeddings"]) print(f"Start processing of {df.shape[0]} images") df = df.sort_values("resultPosition", ascending=True) # df = df.sample(frac=1) # shuffle so you get some images for everybody while it's running for i, row in tqdm(df.iterrows(), total=df.shape[0]): embeddings = get_embeddings(row["contentUrl"]) new_row = row.copy() new_row["embeddings"] = embeddings new_row = new_row[["nconst", "contentUrl", "embeddings"]] df_emb = df_emb.append(new_row, ignore_index=True) if i % 5 == 0: df_emb.to_csv(output_file, index=False) df_emb.to_csv(output_file, index=False) return df_emb def build_annoy_index(): pass if __name__ == "__main__": output_file = "../data/actors_embeddings.csv" df_embeddings = process_all_images( input_file="../data/actors_images.csv", output_file=output_file )