actors_matching / pipeline /get_images_data.py
nbeuchat's picture
black py files
be3b0b4
import os
import requests
import pandas as pd
import os
import time
from datetime import datetime
from tqdm import tqdm
from dotenv import load_dotenv
# Run python-dotenv's loader before reading the key below (presumably so that
# a local .env file can supply it -- confirm against the project setup).
load_dotenv()

# Default subscription key for the Bing API; None when the variable is unset.
BING_API_KEY = os.environ.get("BING_API_KEY")
def get_actor_images(
    name: str,
    role: str = None,
    count: int = 50,
    api_key: str = BING_API_KEY,
    timeout: float = 30.0,
):
    """Get a list of actor images from the Bing Image Search API.

    Args:
        name: Actor name; it is wrapped in double quotes inside the query.
        role: Optional extra term appended to the query in parentheses.
        count: Value sent as the ``count`` query parameter.
        api_key: Subscription key sent in the ``Ocp-Apim-Subscription-Key``
            header. Defaults to the module-level ``BING_API_KEY``.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the caller forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        ValueError: If ``api_key`` is None.
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    if api_key is None:
        raise ValueError("You must provide a Bing API key")

    # Use the key that was passed in, not the module-level default, so that
    # callers can actually override it.
    headers = {"Ocp-Apim-Subscription-Key": api_key}

    query = f'"{name}"'
    if role:
        query = f"{query} ({role})"

    params = {
        "q": query,
        "count": count,
        "imageType": "Photo",
        "safeSearch": "Strict",
        "imageContent": "Face",
        "freshness": "Year",
    }
    response = requests.get(
        "https://api.bing.microsoft.com/v7.0/images/search",
        headers=headers,
        params=params,
        timeout=timeout,
    )
    # Turn HTTP error statuses into exceptions instead of returning their body.
    response.raise_for_status()
    return response.json()
def read_actors_list(
    max_actors: int = None, last_year_active: int = None, sort_by: str = None
):
    """Load ``data/imdb_actors.csv`` and narrow it down.

    Each step is skipped when its argument is falsy (``None`` or ``0``).

    Args:
        max_actors: Keep only this many rows from the top of the table.
        last_year_active: Keep only rows whose ``lastYear`` is greater than
            or equal to this value.
        sort_by: Column to sort on, in descending order, before truncating.

    Returns:
        The filtered (and optionally sorted and truncated) DataFrame.
    """
    actors = pd.read_csv("data/imdb_actors.csv")

    if last_year_active:
        recent_enough = actors["lastYear"] >= last_year_active
        actors = actors[recent_enough]

    if sort_by:
        actors = actors.sort_values(sort_by, ascending=False)

    # Truncation comes last so that it applies to the filtered, sorted rows.
    return actors.head(max_actors) if max_actors else actors
def store_all_actor_images_data(
    max_actors: int = None,
    images_per_actor: int = 10,
    last_year_active: int = None,
    output_file=None,
    max_api_calls_per_second: int = 3,
):
    """Get images data for each actor from the Bing Image Search API and store the results as csv.

    The run is resumable: when ``output_file`` already exists, actors whose
    ``nconst`` appears in it are skipped, and new results are appended to the
    rows read from it. The whole table is rewritten to ``output_file`` after
    every actor so that an interrupted run loses at most one actor.

    Args:
        max_actors: Forwarded to ``read_actors_list`` to cap the actor count.
        images_per_actor: Number of images requested per actor.
        last_year_active: Forwarded to ``read_actors_list`` as the minimum
            ``lastYear`` to keep.
        output_file: CSV path used both to resume from and to store progress.
            When falsy, nothing is read or written.
        max_api_calls_per_second: Upper bound on the request rate; the loop
            sleeps ``1 / max_api_calls_per_second`` seconds after each call.

    Raises:
        ValueError: If ``max_api_calls_per_second`` is not positive.
    """
    # Validate up front instead of hitting a ZeroDivisionError after the
    # first API call has already been made.
    if max_api_calls_per_second <= 0:
        raise ValueError("max_api_calls_per_second must be a positive number")
    pause_between_calls = 1.0 / max_api_calls_per_second

    df = read_actors_list(max_actors, last_year_active)

    df_im = None
    if output_file:
        try:
            df_im = pd.read_csv(output_file)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No previous results yet. Any other error (e.g. a corrupt file)
            # is allowed to propagate rather than being silently overwritten.
            pass

    # remove actors for which we already have images data
    if df_im is not None:
        df = df[~df["nconst"].isin(df_im["nconst"].unique())]

    print(f"Start retrieving images from Bing for {len(df)} actors")
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        try:
            images_data = get_actor_images(
                name=row["primaryName"], count=images_per_actor
            )
        except Exception as e:
            # Best effort: report the failure and move on to the next actor,
            # but still throttle so that failures do not hammer the API.
            print(e)
            time.sleep(pause_between_calls)
            continue

        df_im_tmp = pd.DataFrame(images_data["value"])
        df_im_tmp["nconst"] = row["nconst"]
        # Rank of each image within this actor's search results (0-based).
        df_im_tmp["resultPosition"] = list(range(len(df_im_tmp)))

        if df_im is not None:
            df_im = pd.concat([df_im, df_im_tmp])
        else:
            df_im = df_im_tmp

        # Store progress
        if output_file:
            df_im.to_csv(output_file, index=False)

        # Limit speed of requests to Bing Search
        time.sleep(pause_between_calls)
if __name__ == "__main__":
    # Script entry point: fetch image data for up to 2000 actors whose
    # lastYear falls within the last five years, 20 images each.
    earliest_last_year = datetime.now().year - 5
    store_all_actor_images_data(
        max_actors=2000,
        images_per_actor=20,
        last_year_active=earliest_last_year,
        output_file="data/actors_images_new.csv",
        max_api_calls_per_second=100,
    )