File size: 1,917 Bytes
6e89871
 
 
 
 
6631114
6e89871
 
 
be3b0b4
e3012f6
6e89871
 
 
 
be3b0b4
6e89871
 
 
be3b0b4
 
 
6631114
6e89871
 
 
be3b0b4
6e89871
be3b0b4
 
6e89871
 
 
be3b0b4
6e89871
 
 
 
75ce42f
be3b0b4
6e89871
 
6631114
 
75ce42f
6631114
 
 
6e89871
6631114
 
6e89871
 
be3b0b4
6e89871
 
 
be3b0b4
6e89871
e3012f6
be3b0b4
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import face_recognition
import requests
import pandas as pd
from io import BytesIO
from tqdm import tqdm
from time import time


def get_image(url: str, timeout: float = 30.0):
    """Download the image at *url* and load it with ``face_recognition``.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled server
            could block an entire batch run.

    Returns:
        The image as returned by ``face_recognition.load_image_file`` for the
        downloaded bytes.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not respond within *timeout*.
    """
    headers = {"User-Agent": "Actors matching app 1.0"}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # Wrap the raw bytes in a file-like object so the loader can read them
    # without writing the download to disk.
    return face_recognition.load_image_file(BytesIO(response.content))


def get_embeddings(url: str):
    """Return the face embedding of the first face found in the image at *url*.

    This is a best-effort helper: any failure while downloading or encoding
    the image is printed and reported as ``None`` so that a batch run can
    carry on with the next image.

    Args:
        url: Address of the image to download and encode.

    Returns:
        A list of Python floats for the first encoding returned by
        ``face_recognition.face_encodings``, or ``None`` if the image could
        not be processed or no face encoding was produced.
    """
    try:
        image = get_image(url)
        encodings = face_recognition.face_encodings(
            image, num_jitters=2, model="large"
        )
        if len(encodings) == 0:
            # Previously this surfaced as an opaque "list index out of range".
            print(f"No face found in image: {url}")
            return None
        # Plain floats keep the list's string form (as written to CSV by the
        # caller) independent of how NumPy prints its scalar types.
        return [float(value) for value in encodings[0]]
    except Exception as e:
        # Deliberate catch-all boundary: one bad image must not stop a batch.
        print(e)
        return None


def process_all_images(input_file, output_file, save_every: int = 5):
    """Compute embeddings for the images listed in *input_file* and save them.

    The input CSV must contain the columns ``nconst``, ``contentUrl`` and
    ``resultPosition``. If *output_file* already exists, URLs already present
    in it are skipped, so an interrupted run can be resumed. Images are
    processed in ascending ``resultPosition`` order and the results are
    checkpointed to *output_file* while the loop runs.

    Args:
        input_file: Path of the CSV listing the images to process.
        output_file: Path of the CSV holding the results; read if it exists
            and rewritten with the accumulated rows.
        save_every: Write a checkpoint after this many processed images.

    Returns:
        A DataFrame with the previously stored rows followed by the newly
        processed ones. Images that could not be encoded are kept with an
        empty ``embeddings`` value (and are therefore not retried on resume).

    Raises:
        ValueError: If *save_every* is smaller than 1.
        KeyError: If the input file lacks a required column, or an existing
            output file has no ``contentUrl`` column.
    """
    if save_every < 1:
        raise ValueError("save_every must be a positive integer")

    df = pd.read_csv(input_file)[["nconst", "contentUrl", "resultPosition"]]

    try:
        df_emb = pd.read_csv(output_file)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No previous results yet: start from an empty table.
        df_emb = pd.DataFrame(columns=list(df.columns) + ["embeddings"])
    else:
        # Resume: drop every image whose URL already has a stored row.
        df = df[~df["contentUrl"].isin(df_emb["contentUrl"])]

    print(f"Start processing of {df.shape[0]} images")
    df = df.sort_values("resultPosition", ascending=True)
    # NOTE: shuffling instead (df.sample(frac=1)) would spread early results
    # over all people while the job is still running.

    # Keep the existing column layout and make sure the result columns exist.
    columns = list(df_emb.columns)
    for name in ("nconst", "contentUrl", "embeddings"):
        if name not in columns:
            columns.append(name)

    # Accumulate rows in a plain list: DataFrame.append was removed in
    # pandas 2.0 and copied the whole frame on every call.
    records = df_emb.to_dict("records")

    pairs = zip(df["nconst"], df["contentUrl"])
    for done, (nconst, url) in enumerate(tqdm(pairs, total=df.shape[0]), start=1):
        records.append(
            {
                "nconst": nconst,
                "contentUrl": url,
                "embeddings": get_embeddings(url),
            }
        )
        # Count processed images rather than using the DataFrame index label,
        # which is no longer sequential after filtering and sorting.
        if done % save_every == 0:
            pd.DataFrame(records, columns=columns).to_csv(output_file, index=False)

    df_emb = pd.DataFrame(records, columns=columns)
    df_emb.to_csv(output_file, index=False)
    return df_emb


def build_annoy_index():
    """Placeholder that is not implemented yet.

    The function currently performs no work and returns ``None``.
    NOTE(review): judging by the name this is presumably meant to build an
    Annoy nearest-neighbour index from the embeddings; confirm before
    implementing.
    """
    return None


if __name__ == "__main__":
    # Script entry point: embed every image listed in the input CSV and
    # store the results next to it in the data directory.
    input_file = "../data/actors_images.csv"
    output_file = "../data/actors_embeddings.csv"
    df_embeddings = process_all_images(input_file, output_file)