Spaces:
Runtime error
Runtime error
File size: 3,160 Bytes
6e89871 be3b0b4 6e89871 be3b0b4 75ce42f e3012f6 6e89871 be3b0b4 6e89871 be3b0b4 6e89871 75ce42f 6e89871 be3b0b4 6e89871 be3b0b4 6e89871 be3b0b4 6e89871 be3b0b4 6e89871 be3b0b4 6e89871 75ce42f be3b0b4 75ce42f be3b0b4 75ce42f 6e89871 be3b0b4 6e89871 be3b0b4 6e89871 be3b0b4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
import os
import requests
import pandas as pd
import os
import time
from datetime import datetime
from tqdm import tqdm
from dotenv import load_dotenv
# Load variables from a .env file (if any) into the process environment, then
# look up the Bing subscription key; it is None when the variable is not set.
load_dotenv()
BING_API_KEY = os.environ.get("BING_API_KEY")
def get_actor_images(
    name: str, role: str = None, count: int = 50, api_key: str = BING_API_KEY
):
    """Query the Bing Image Search API for face photos matching an actor.

    Args:
        name: Actor name; it is wrapped in double quotes in the search query.
        role: Optional extra text appended to the query in parentheses.
        count: Value sent as the API's ``count`` parameter.
        api_key: Bing subscription key. Defaults to the module-level
            ``BING_API_KEY`` read from the environment.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        ValueError: If no API key is available.
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    if api_key is None:
        raise ValueError("You must provide a Bing API key")

    # Authenticate with the key that was passed in (and validated above)
    # rather than the module-level default, so a caller-supplied key is
    # actually the one sent to the API.
    headers = {"Ocp-Apim-Subscription-Key": api_key}

    query = f'"{name}"'
    if role:
        query = f"{query} ({role})"

    params = {
        "q": query,
        "count": count,
        "imageType": "Photo",
        "safeSearch": "Strict",
        "imageContent": "Face",
        "freshness": "Year",
    }
    response = requests.get(
        "https://api.bing.microsoft.com/v7.0/images/search",
        headers=headers,
        params=params,
        # Without a timeout a stalled connection would block forever.
        timeout=30,
    )
    response.raise_for_status()
    return response.json()
def read_actors_list(
    max_actors: int = None, last_year_active: int = None, sort_by: str = None
):
    """Load ``data/imdb_actors.csv`` and optionally filter, sort and truncate it.

    Args:
        max_actors: If truthy, keep only this many rows from the top (applied
            after filtering and sorting).
        last_year_active: If truthy, keep only rows whose ``lastYear`` is
            greater than or equal to this value.
        sort_by: If truthy, name of the column to sort by in descending order.

    Returns:
        The resulting pandas DataFrame.
    """
    actors = pd.read_csv("data/imdb_actors.csv")

    if last_year_active:
        recent_enough = actors["lastYear"] >= last_year_active
        actors = actors[recent_enough]

    if sort_by:
        actors = actors.sort_values(by=sort_by, ascending=False)

    return actors.head(max_actors) if max_actors else actors
def store_all_actor_images_data(
    max_actors: int = None,
    images_per_actor: int = 10,
    last_year_active: int = None,
    output_file=None,
    max_api_calls_per_second: int = 3,
):
    """Fetch Bing image search results for each actor and store them as CSV.

    Actors whose ``nconst`` already appears in ``output_file`` are skipped and
    the file is rewritten after every successful request, so an interrupted
    run can be resumed by calling the function again with the same file.

    Args:
        max_actors: Forwarded to ``read_actors_list`` to cap the actor count.
        images_per_actor: ``count`` passed to ``get_actor_images``.
        last_year_active: Forwarded to ``read_actors_list`` as the minimum
            ``lastYear``.
        output_file: CSV path used both to resume from and to save progress
            to. When falsy, nothing is read or written.
        max_api_calls_per_second: Upper bound on the request rate; the loop
            sleeps ``1 / max_api_calls_per_second`` seconds per request.

    Returns:
        A DataFrame holding the previously stored rows plus the newly fetched
        ones, or ``None`` if there was no stored data and nothing was fetched.

    Raises:
        ValueError: If ``max_api_calls_per_second`` is not positive.
    """
    # Validate before any API call is made instead of failing on the first
    # sleep with a ZeroDivisionError.
    if max_api_calls_per_second <= 0:
        raise ValueError("max_api_calls_per_second must be positive")
    delay_between_calls = 1.0 / max_api_calls_per_second

    df = read_actors_list(max_actors, last_year_active)

    df_im = None
    if output_file:
        try:
            df_im = pd.read_csv(output_file)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No previous results yet (file missing or empty): start fresh.
            # Any other read error is left to propagate so that an unreadable
            # file holding earlier progress is not silently overwritten.
            pass

    # Remove actors for which we already have images data
    if df_im is not None:
        df = df[~df["nconst"].isin(df_im["nconst"].unique())]

    print(f"Start retrieving images from Bing for {len(df)} actors")
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        try:
            images_data = get_actor_images(
                name=row["primaryName"], count=images_per_actor
            )
        except Exception as e:
            # Best effort: report the failure and move on to the next actor,
            # but still honour the rate limit so failures do not cause a
            # burst of back-to-back requests.
            print(e)
            time.sleep(delay_between_calls)
            continue

        df_im_tmp = pd.DataFrame(images_data["value"])
        df_im_tmp["nconst"] = row["nconst"]
        # Rank of each image within this actor's search results (0-based)
        df_im_tmp["resultPosition"] = list(range(len(df_im_tmp)))

        if df_im is not None:
            df_im = pd.concat([df_im, df_im_tmp])
        else:
            df_im = df_im_tmp

        # Store progress (only when there is a file to store it in)
        if output_file:
            df_im.to_csv(output_file, index=False)

        # Limit speed of requests to Bing Search
        time.sleep(delay_between_calls)

    return df_im
if __name__ == "__main__":
    # Restrict the run to actors whose lastYear is no more than five years
    # before the current year.
    earliest_last_year = datetime.now().year - 5

    store_all_actor_images_data(
        max_actors=2000,
        images_per_actor=20,
        last_year_active=earliest_last_year,
        output_file="data/actors_images_new.csv",
        max_api_calls_per_second=100,
    )
|