Spaces:
Runtime error
Runtime error
import webdataset as wds | |
import glob | |
import os | |
from tqdm import tqdm | |
from tqdm.contrib.concurrent import process_map | |
import pickle as pkl | |
def single_thread(filename):
    """Build an image-id lookup table for a single WebDataset tar shard.

    Every sample's decoded ``json`` entry is read from ``filename``. The image
    id is the portion of the entry's ``"caption"`` before the first ``"."``,
    and it is mapped to ``[shard basename, sample "key"]``. If an id appears
    more than once inside the shard, the first occurrence is kept.

    Args:
        filename: Path of the tar shard handed to ``wds.WebDataset``.

    Returns:
        dict mapping image id -> ``[tar basename, image key]``.
    """
    samples = wds.WebDataset(filename).decode().to_tuple("json")
    # The shard name is identical for every sample, so compute it once.
    shard_name = os.path.basename(filename)
    lookup = {}
    for sample in samples:
        # The json payload is the first element of the tuple from to_tuple("json").
        meta = sample[0]
        stem = meta["caption"].split(".")[0]
        # setdefault leaves an existing entry alone: first occurrence wins.
        lookup.setdefault(stem, [shard_name, meta["key"]])
    return lookup
if __name__ == "__main__":
    # Script entry point: index every tar shard in parallel, merge the
    # per-shard tables, and pickle the result to "mmc4_id_table.pkl".

    # Sorted so the processed range is deterministic; capped at 16000 shards.
    shard_pattern = "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/mmc4/images/*.tar"
    filenames = sorted(glob.glob(shard_pattern))[:16000]
    if not filenames:
        # Without this guard the prints below fail with a bare IndexError
        # that does not say which path was searched.
        raise SystemExit(f"no tar shards matched {shard_pattern}")
    print("start from", filenames[0])
    print("to", filenames[-1])

    # One task per shard, spread over up to 64 worker processes; each task
    # returns that shard's {image_id: [tar basename, image key]} table.
    id_tables = process_map(single_thread, filenames, max_workers=64)

    # Merge the per-shard tables. NOTE: dict.update means that for an id
    # present in several shards the entry from the LATER shard is kept.
    id_table = {}
    for table in tqdm(id_tables):
        id_table.update(table)
    print("total unique image:", len(id_table))

    # Use a context manager so the pickle is flushed and the handle closed
    # even if dumping raises, instead of relying on garbage collection.
    with open("mmc4_id_table.pkl", "wb") as out_file:
        pkl.dump(id_table, out_file)
    print("DONE")