"""Repack a VQAv2 training TSV into WebDataset tar shards.

Each input row is tab-separated. The columns used here are:

    0, 1, 6  -> identifiers, joined into the sample key
    2        -> question text
    3        -> answer field; only the text after the last "|!+" is kept
    5        -> base64 (URL-safe alphabet) encoded image bytes

Column 4 is ignored. Every row becomes one sample with a "jpg" entry (the
decoded RGB image) and a "txt" entry ("Question: ... Answer: ...").
"""
import base64
import os
from io import BytesIO

import webdataset as wds
from PIL import Image
from tqdm import tqdm

OUT_DIR = "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/junyan/raw/vqav2_train_wds"
IN_TSV = "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/junyan/raw/vqav2_ofa/vqa_train.tsv"
# Only used as the progress-bar total; it does not limit how many rows are read.
TOTAL = 1828467

if __name__ == "__main__":
    # The shard writer opens files inside OUT_DIR, so make sure it exists.
    os.makedirs(OUT_DIR, exist_ok=True)

    with wds.ShardWriter(os.path.join(OUT_DIR, "%06d.tar"), maxcount=10000) as sink:
        sink.verbose = False
        # Context manager so the input handle is closed even if a row fails;
        # explicit encoding so decoding does not depend on the locale.
        # NOTE(review): assumes the TSV is UTF-8 -- confirm.
        with open(IN_TSV, encoding="utf-8") as f:
            for data in tqdm(f, total=TOTAL):
                data = data.rstrip().split("\t")
                id1 = data[0]
                id2 = data[1]
                question = data[2]
                # Keep only what follows the last "|!+" separator.
                # NOTE(review): presumably the prefix is a score/weight -- confirm.
                answer = data[3].split("|!+")[-1]
                image = data[5]
                id3 = data[6]
                image = Image.open(
                    BytesIO(base64.urlsafe_b64decode(image))
                ).convert("RGB")
                caption = f"Question: {question.strip()} Answer: {answer.strip()}"
                sink.write(
                    {
                        "__key__": f"vqav2_{id1}_{id2}_{id3}",
                        "jpg": image,
                        "txt": caption,
                    }
                )