Spaces:
Runtime error
Runtime error
import os | |
import requests | |
import pandas as pd | |
import os | |
import time | |
from datetime import datetime | |
from tqdm import tqdm | |
from dotenv import load_dotenv | |
# Let python-dotenv populate the environment before the key is looked up.
load_dotenv()

# Bing subscription key taken from the environment; None when it is not set.
BING_API_KEY = os.environ.get("BING_API_KEY")
def get_actor_images(
    name: str,
    role: str = None,
    count: int = 50,
    api_key: str = BING_API_KEY,
    timeout: float = 30.0,
):
    """Query the Bing Image Search API for images of an actor.

    The query is the actor's name wrapped in double quotes, optionally
    followed by the role in parentheses. The request asks for
    ``imageType=Photo``, ``imageContent=Face``, ``safeSearch=Strict`` and
    ``freshness=Year``.

    Args:
        name: Actor name; it is double-quoted in the query string.
        role: Optional role appended to the query as ``(role)``.
        count: Value sent as the API's ``count`` parameter.
        api_key: Bing subscription key. Defaults to the module-level
            ``BING_API_KEY`` captured when the function was defined.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller forever.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        ValueError: If no API key is available.
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    if api_key is None:
        raise ValueError("You must provide a Bing API key")

    # Authenticate with the key given by the caller, not the module global;
    # otherwise an explicitly passed ``api_key`` would be silently ignored.
    headers = {"Ocp-Apim-Subscription-Key": api_key}

    query = f'"{name}"'
    if role:
        query = f"{query} ({role})"

    params = {
        "q": query,
        "count": count,
        "imageType": "Photo",
        "safeSearch": "Strict",
        "imageContent": "Face",
        "freshness": "Year",
    }
    response = requests.get(
        "https://api.bing.microsoft.com/v7.0/images/search",
        headers=headers,
        params=params,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()
def read_actors_list(
    max_actors: int = None,
    last_year_active: int = None,
    sort_by: str = None,
    csv_path: str = "data/imdb_actors.csv",
):
    """Read the actors CSV and optionally filter, sort and truncate it.

    The steps are applied in this order: filter, sort, truncate. Each step
    is skipped when its argument is falsy (``None``, ``0`` or ``""``).

    Args:
        max_actors: Keep only the first ``max_actors`` rows.
        last_year_active: Keep only rows whose ``lastYear`` column is greater
            than or equal to this value.
        sort_by: Column (or columns) to sort by, in descending order.
        csv_path: Location of the actors CSV. Defaults to the path this
            function has always read from.

    Returns:
        A pandas DataFrame with the selected rows.
    """
    df = pd.read_csv(csv_path)
    if last_year_active:
        df = df[df["lastYear"] >= last_year_active]
    if sort_by:
        df = df.sort_values(sort_by, ascending=False)
    if max_actors:
        df = df.head(max_actors)
    return df
def store_all_actor_images_data(
    max_actors: int = None,
    images_per_actor: int = 10,
    last_year_active: int = None,
    output_file=None,
    max_api_calls_per_second: int = 3,
):
    """Fetch Bing image search results for each actor and store them as CSV.

    Actors come from ``read_actors_list``. When ``output_file`` already
    exists, actors whose ``nconst`` is present in it are skipped, so an
    interrupted run can be resumed. The accumulated table is rewritten to
    ``output_file`` after every successful lookup.

    Args:
        max_actors: Forwarded to ``read_actors_list``.
        images_per_actor: Number of results requested per actor.
        last_year_active: Forwarded to ``read_actors_list``.
        output_file: CSV path used both to resume from and to store results.
            When falsy, nothing is read or written.
        max_api_calls_per_second: Upper bound on the request rate. A value
            of zero or less disables throttling.

    Returns:
        None.
    """
    df = read_actors_list(max_actors, last_year_active)

    df_im = None
    if output_file:
        try:
            df_im = pd.read_csv(output_file)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No previous results (file missing or empty): start from scratch.
            # Other errors propagate so an unreadable file is not silently
            # overwritten.
            df_im = None

    # Remove actors for which we already have images data.
    if df_im is not None:
        df = df[~df["nconst"].isin(df_im["nconst"].unique())]

    # Pause between consecutive API calls; no pause for a non-positive rate.
    if max_api_calls_per_second > 0:
        delay = 1.0 / max_api_calls_per_second
    else:
        delay = 0.0

    print(f"Start retrieving images from Bing for {len(df)} actors")
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        try:
            images_data = get_actor_images(
                name=row["primaryName"], count=images_per_actor
            )
        except Exception as e:
            # Best effort: report the failure and move on to the next actor.
            print(e)
            images_data = None

        # A response without a "value" list is treated as "no images".
        images = images_data.get("value", []) if images_data else []
        if images:
            df_im_tmp = pd.DataFrame(images)
            df_im_tmp["nconst"] = row["nconst"]
            df_im_tmp["resultPosition"] = range(len(df_im_tmp))
            if df_im is not None:
                df_im = pd.concat([df_im, df_im_tmp])
            else:
                df_im = df_im_tmp
            # Store progress so a crash loses at most the current actor.
            if output_file:
                df_im.to_csv(output_file, index=False)

        # Throttle after every call, including failed ones, so errors such as
        # rate-limit responses are not retried at full speed.
        time.sleep(delay)
if __name__ == "__main__":
    # Cut-off passed as last_year_active: five years before the current year.
    earliest_last_year = datetime.now().year - 5
    store_all_actor_images_data(
        max_actors=2000,
        images_per_actor=20,
        last_year_active=earliest_last_year,
        output_file="data/actors_images_new.csv",
        max_api_calls_per_second=100,
    )