Spaces:
Runtime error
Runtime error
File size: 1,917 Bytes
6e89871 6631114 6e89871 be3b0b4 e3012f6 6e89871 be3b0b4 6e89871 be3b0b4 6631114 6e89871 be3b0b4 6e89871 be3b0b4 6e89871 be3b0b4 6e89871 75ce42f be3b0b4 6e89871 6631114 75ce42f 6631114 6e89871 6631114 6e89871 be3b0b4 6e89871 be3b0b4 6e89871 e3012f6 be3b0b4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import face_recognition
import requests
import pandas as pd
from io import BytesIO
from tqdm import tqdm
from time import time
def get_image(url: str, timeout: float = 30.0):
    """Download the image at *url* and load it for face_recognition.

    Args:
        url: HTTP(S) address of the image.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a single stalled server would block the whole batch
            indefinitely.

    Returns:
        The image as returned by ``face_recognition.load_image_file``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    # A descriptive User-Agent is sent with every request.
    headers = {"User-Agent": "Actors matching app 1.0"}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # load_image_file accepts a file-like object, so the payload never
    # has to be written to disk.
    img_file_object = BytesIO(response.content)
    return face_recognition.load_image_file(img_file_object)
def get_embeddings(url: str):
    """Return the face embedding of the first face found in the image at *url*.

    This is a best-effort helper: any failure is reported on stdout and
    turned into ``None`` so that a batch run can carry on with the next
    image.

    Args:
        url: HTTP(S) address of the image to process.

    Returns:
        A list of floats describing the first detected face, or ``None``
        when the image could not be fetched/decoded or contains no
        detectable face.
    """
    try:
        image = get_image(url)
        embeddings = face_recognition.face_encodings(
            image, num_jitters=2, model="large"
        )
    except Exception as e:
        # Deliberately broad: download, decoding and encoding errors are
        # all non-fatal for the caller.
        print(f"Could not compute embeddings for {url}: {e}")
        return None

    # One entry per detected face; none at all means there is nothing to
    # index (previously this surfaced as an opaque IndexError message).
    if len(embeddings) == 0:
        print(f"No face found in {url}")
        return None

    # Presumably the values are NumPy scalars; converting to plain floats
    # keeps the text written to CSV independent of NumPy's scalar repr.
    return [float(value) for value in embeddings[0]]
def _append_rows(df_emb, rows):
    """Return *df_emb* extended with *rows* (a list of dicts)."""
    if not rows:
        return df_emb
    new_rows = pd.DataFrame(rows)
    if df_emb.empty:
        # Skip concatenating with an empty frame; just adopt its column
        # layout (missing columns become NaN, extra ones are kept).
        columns = list(df_emb.columns) + [
            col for col in new_rows.columns if col not in df_emb.columns
        ]
        return new_rows.reindex(columns=columns)
    return pd.concat([df_emb, new_rows], ignore_index=True)


def process_all_images(input_file, output_file):
    """Compute face embeddings for every image listed in *input_file*.

    The run is resumable: URLs already present in *output_file* are
    skipped, and progress is written back to *output_file* every few
    images as well as once at the end.

    Args:
        input_file: CSV with at least the columns ``nconst``,
            ``contentUrl`` and ``resultPosition``.
        output_file: CSV holding the embeddings computed so far; it is
            created when missing and rewritten on every checkpoint.

    Returns:
        DataFrame with the previously stored rows plus one row
        (``nconst``, ``contentUrl``, ``embeddings``) per image processed
        in this run. ``embeddings`` is empty for images where
        ``get_embeddings`` returned ``None``; such URLs are still
        recorded, so they are not retried on the next run.
    """
    checkpoint_every = 5

    df = pd.read_csv(input_file)[["nconst", "contentUrl", "resultPosition"]]
    try:
        df_emb = pd.read_csv(output_file)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No usable output yet: start from an empty table.
        df_emb = pd.DataFrame(columns=list(df.columns) + ["embeddings"])
    else:
        # Resume: drop every image that already has a stored row.
        df = df[~df["contentUrl"].isin(df_emb["contentUrl"])]

    print(f"Start processing of {df.shape[0]} images")
    df = df.sort_values("resultPosition", ascending=True)

    pending = []
    work = zip(df["nconst"], df["contentUrl"])
    # Count processed rows rather than relying on the DataFrame index
    # labels, which are no longer sequential after filtering and sorting.
    for count, (nconst, url) in enumerate(tqdm(work, total=df.shape[0]), start=1):
        pending.append(
            {"nconst": nconst, "contentUrl": url, "embeddings": get_embeddings(url)}
        )
        if count % checkpoint_every == 0:
            # DataFrame.append no longer exists in pandas 2.x; batch the
            # rows and merge them at each checkpoint instead.
            df_emb = _append_rows(df_emb, pending)
            pending = []
            df_emb.to_csv(output_file, index=False)

    df_emb = _append_rows(df_emb, pending)
    df_emb.to_csv(output_file, index=False)
    return df_emb
def build_annoy_index():
    """Placeholder (not implemented yet); does nothing and returns None."""
    return None
if __name__ == "__main__":
    # Script entry point: read the image list, compute the embeddings and
    # store them next to it in the data directory.
    output_file = "../data/actors_embeddings.csv"
    df_embeddings = process_all_images("../data/actors_images.csv", output_file)
|