File size: 983 Bytes
6e89871
 
 
 
 
 
be3b0b4
6e89871
 
 
 
be3b0b4
6e89871
 
 
 
 
 
 
 
be3b0b4
6e89871
 
 
be3b0b4
 
6e89871
be3b0b4
6e89871
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import os
import gzip
import shutil
from urllib.request import urlretrieve
from tqdm import tqdm


def download_large_file(url: str, output_file: str) -> None:
    """Download ``url`` to ``output_file`` unless that file already exists.

    The payload is first written to ``<output_file>.part`` and only renamed
    to ``output_file`` once the transfer has finished, so an interrupted
    download is never mistaken for a complete file on the next run.

    Args:
        url: Location of the resource to fetch.
        output_file: Destination path; missing parent directories are created.

    Raises:
        urllib.error.URLError: If the resource cannot be retrieved.
        OSError: If the destination cannot be written.
    """
    if os.path.exists(output_file):
        return

    # urlretrieve does not create directories, so make sure the target exists.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    partial_file = f"{output_file}.part"
    try:
        urlretrieve(url, partial_file)
    except BaseException:
        # Also covers KeyboardInterrupt: never leave a truncated file behind.
        if os.path.exists(partial_file):
            os.remove(partial_file)
        raise
    os.replace(partial_file, output_file)


def unzip_file(input_file: str) -> None:
    """Decompress a gzip archive next to itself, skipping existing output.

    The output path is ``input_file`` with its last extension removed, so an
    input named ``xxx.tsv.gz`` is written to ``xxx.tsv``. Data is decompressed
    into ``<output>.part`` and renamed on success, so a corrupt or interrupted
    archive never leaves a partial output that later runs would skip.

    Args:
        input_file: Path to the gzip-compressed file.

    Raises:
        OSError: If the input cannot be read as gzip or the output cannot be
            written.
    """
    output_file = os.path.splitext(input_file)[0]
    if os.path.exists(output_file):
        return

    partial_file = f"{output_file}.part"
    try:
        with gzip.open(input_file, "rb") as f_in:
            with open(partial_file, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
    except BaseException:
        # Also covers KeyboardInterrupt: drop the incomplete output.
        if os.path.exists(partial_file):
            os.remove(partial_file)
        raise
    os.replace(partial_file, output_file)


def main() -> None:
    """Download each IMDb dataset archive into ``data`` and decompress it."""
    imdb_url = "https://datasets.imdbws.com"
    data_dir = "data"
    filenames = [
        "name.basics.tsv.gz",
        "title.basics.tsv.gz",
        "title.ratings.tsv.gz",
        "title.principals.tsv.gz",
    ]

    # The download step writes straight into this directory, so it must exist
    # before the first file is fetched (e.g. on a fresh checkout).
    os.makedirs(data_dir, exist_ok=True)

    for filename in tqdm(filenames):
        url = f"{imdb_url}/{filename}"
        output_file = os.path.join(data_dir, filename)
        download_large_file(url, output_file)
        unzip_file(output_file)


if __name__ == "__main__":
    main()