"""Convert the PISC relationship test split into WebDataset shards.

For every annotated image that has relationship labels and belongs to the
test split, one sample is written per labelled person pair.  Each sample
stores the image path, the dataset name and a ``[A_box, B_box, type]``
payload, where the boxes are divided by the image width/height and ``type``
is the relationship name looked up in ``rel_id_to_type``.
"""
import json
import os

import numpy as np
from tqdm import tqdm
import webdataset as wds

from utils import MAXCOUNT, NAMING, check_sample

PISC_ROOT = "/gpfs/u/home/LMCG/LMCGljnn/scratch/datasets/raw/PISC"
OUT_DIR = "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/junyan/raw/instruct/eval/pisc"
# Relationship labels in relationship.json are used as 1-based indices into
# this list (label 1 -> "friends", ..., label 6 -> "no relation").
rel_id_to_type = ["friends", "family", "couple", "professional", "commercial", "no relation"]


def _load_json(path):
    """Return the parsed content of the JSON file at *path*, closing the file."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def main():
    """Write one WebDataset sample per labelled person pair of the test split.

    Raises:
        FileNotFoundError: if an image that has relationship labels is
            missing from ``PISC_ROOT/image``.
    """
    os.makedirs(OUT_DIR, exist_ok=True)
    annotation_image_info = _load_json(os.path.join(PISC_ROOT, "annotation_image_info.json"))
    relationships = _load_json(os.path.join(PISC_ROOT, "relationship.json"))
    # Built once so the per-image membership test below is O(1).
    # NOTE(review): assumes relation_testidx.json holds image ids as strings
    # (a list of them, or a dict keyed by them) -- confirm against the data.
    test_ids = frozenset(
        _load_json(os.path.join(PISC_ROOT, "relationship_split", "relation_testidx.json"))
    )

    dataset = "pisc_relation_split"
    uuid = 0  # running counter that keeps sample keys unique across all images
    with wds.ShardWriter(os.path.join(OUT_DIR, NAMING), maxcount=MAXCOUNT**3) as sink:
        for annotation in tqdm(annotation_image_info):
            image_id = annotation["id"]
            image_key = str(image_id)  # the JSON lookups below use string ids
            if image_key not in relationships:
                tqdm.write(f"skip {image_id} due to not in relationships")
                continue
            if image_key not in test_ids:
                tqdm.write(f"skip {image_id} due to not in test set")
                continue

            bbox = annotation["bbox"]  # xyxy
            img_w = annotation["imgW"]
            img_h = annotation["imgH"]
            # Per-image divisor that maps xyxy pixel boxes to width/height fractions.
            scale = np.array([img_w, img_h, img_w, img_h]).astype(float)
            image_path = os.path.join(PISC_ROOT, "image", image_key.zfill(5) + ".jpg")

            pair_labels = relationships[image_key]
            # Explicit check instead of `assert`, which is stripped under -O.
            # Only images that actually yield samples are required to exist.
            if pair_labels and not os.path.exists(image_path):
                raise FileNotFoundError(f"missing PISC image: {image_path}")

            for pair, label in pair_labels.items():
                rel_type = rel_id_to_type[label - 1]
                # Keys look like "<A> <B>"; both are 1-based indices into bbox.
                a_id, b_id = map(int, pair.split(" "))
                a_box = np.array(bbox[a_id - 1]).astype(float) / scale
                b_box = np.array(bbox[b_id - 1]).astype(float) / scale

                sample = {
                    "__key__": f"{dataset}_{image_id}_{uuid}",
                    "image_path.txt": image_path,
                    "dataset.txt": dataset,
                    "data.pyd": [a_box, b_box, rel_type],
                }
                uuid += 1
                check_sample(sample)
                sink.write(sample)


if __name__ == "__main__":
    main()