# Source: ProteinGPT-Llama3 / minigpt4/datasets/builders/pdb_text_pair_builder.py
# Uploaded by EdwardoSunny; commit 85ab89d ("finished"); file size 1.23 kB.
import os
import logging
import warnings
from minigpt4.common.registry import registry
from minigpt4.datasets.builders.pdb_base_dataset_builder import PDB_BaseDatasetBuilder
from minigpt4.datasets.datasets.pdb_dataset import ESMDataset
@registry.register_builder("pdb")
class PDBBuilder(PDB_BaseDatasetBuilder):
    """Dataset builder registered with the registry under the name ``"pdb"``.

    Builds a single ``train`` split using ``ESMDataset``.
    """

    # Dataset class instantiated for the ``train`` split.
    train_dataset_cls = ESMDataset

    # Config name -> path of the dataset YAML file.
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/pdb/pdb.yaml",
    }

    def build_datasets(self):
        """Build the processors and return the ``train`` dataset.

        The storage location is read from ``self.config.build_info.storage``.
        The annotation file ``filter_cap.json`` and the ``pdb`` directory are
        both resolved relative to it. This method does not download
        anything itself; a missing storage path only triggers a warning and
        construction of the dataset is still attempted.

        Returns:
            dict: ``{"train": <train_dataset_cls instance>}``.
        """
        logging.info("Building datasets...")
        self.build_processors()

        storage_root = self.config.build_info.storage

        # Warn rather than fail: the dataset class is still given the chance
        # to handle (or report) the missing location on its own.
        if not os.path.exists(storage_root):
            warnings.warn("storage path {} does not exist.".format(storage_root))

        # Resolve the constructor inputs in the same order the call needs them.
        make_dataset = self.train_dataset_cls
        train_text_processor = self.text_processors["train"]
        annotation_file = os.path.join(storage_root, 'filter_cap.json')
        structure_dir = os.path.join(storage_root, 'pdb')

        train_split = make_dataset(
            text_processor=train_text_processor,
            ann_paths=[annotation_file],
            pdb_root=structure_dir,
        )
        return {'train': train_split}