Spaces:
Sleeping
Sleeping
'''
    This file prepares the dataset as a JSONL file.
'''
import os, sys, shutil | |
import json | |
# Import files from the local folder | |
root_path = os.path.abspath('.') | |
sys.path.append(root_path) | |
from curation_pipeline.prepare_bridge_v1 import read_bridge_v1 | |
from curation_pipeline.prepare_bridge_v2 import read_bridge_v2 | |
if __name__ == "__main__":
    # Input roots for the two Bridge dataset versions and the output manifest name
    v1_dataset_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/raw/bridge_data_v1/berkeley"
    v2_dataset_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/raw/bridge_data_v2"
    store_name = "store.jsonl"

    # Drop any manifest left over from a previous run before regenerating it
    if os.path.exists(store_name):
        os.remove(store_name)

    # Collect the entries returned by both readers, v1 first and then v2.
    # Both are called with an empty second argument and copyfile=False.
    # NOTE(review): presumably each reader returns a list of file paths -- confirm
    full_lists = [
        *read_bridge_v1(v1_dataset_path, "", copyfile=False),
        *read_bridge_v2(v2_dataset_path, "", copyfile=False),
    ]
    print("Full length is ", len(full_lists))

    # Write one JSON object per line, each holding a single "file_path" key
    with open(store_name, 'w') as outfile:
        for entry in full_lists:
            outfile.write(json.dumps({"file_path": entry}))
            outfile.write('\n')
# with open('output.jsonl', 'w') as outfile: | |
# for entry in JSON_file: | |
# json.dump(entry, outfile) | |
# outfile.write('\n') |