chendl's picture
Add application file
0b7b08a
raw
history blame
867 Bytes
import os
import shutil
import glob
import random
# Root directory under which the source .tar files are globbed.
DIR = "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/junyan/raw"
# Directory (created if missing) that receives the numbered symlinks to those tars.
OUT_DIR = "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/junyan/raw/blip2_mini_dataset_full_karpathy"
def _gather_tars(src_dir):
    """Return every ``.tar`` path found in the known dataset sub-directories of *src_dir*."""
    # Each tuple is a path pattern relative to src_dir, in the same order the
    # datasets were originally collected (cc3m, cc12m, COCO, VG).
    patterns = (
        ("cc3m", "cc3m_*", "*.tar"),
        ("cc12m", "tars", "*.tar"),
        ("karpathy_coco_wds_full", "*.tar"),
        ("vg_wds_full", "*.tar"),
    )
    tars = []
    for parts in patterns:
        tars.extend(glob.glob(os.path.join(src_dir, *parts)))
    return tars


def main(src_dir=None, out_dir=None):
    """Create shuffled, sequentially numbered symlinks to all dataset tars.

    Every tar found under *src_dir* is linked into *out_dir* as
    ``000000.tar``, ``000001.tar``, ... in a random order.

    Args:
        src_dir: Directory containing the dataset sub-directories.
            Defaults to the module-level ``DIR``.
        out_dir: Directory that receives the symlinks; created if missing.
            Defaults to the module-level ``OUT_DIR``.

    Raises:
        FileExistsError: If a numbered destination already exists and is a
            regular file or directory rather than a symlink.

    Note:
        Re-running replaces symlinks at the numbered paths it writes, but it
        does not remove higher-numbered links left over from an earlier run
        that found more tars.
    """
    src_dir = DIR if src_dir is None else src_dir
    out_dir = OUT_DIR if out_dir is None else out_dir

    os.makedirs(out_dir, exist_ok=True)
    tars = _gather_tars(src_dir)
    random.shuffle(tars)
    for i, tar in enumerate(tars):
        # Absolute targets keep the links valid even when src_dir is relative
        # (a relative target would be resolved against the link's directory).
        tar = os.path.abspath(tar)
        dst = os.path.join(out_dir, f"{i:06d}.tar")
        print(tar, dst)
        # A previous run leaves links behind; replace them instead of crashing
        # with FileExistsError. Only symlinks are removed, so real files are
        # never clobbered.
        if os.path.islink(dst):
            os.remove(dst)
        os.symlink(tar, dst)


if __name__ == "__main__":
    main()