Delete lmdb_jpg.py
Browse files- lmdb_jpg.py +0 -69
lmdb_jpg.py
DELETED
@@ -1,69 +0,0 @@
|
|
1 |
-
import multiprocessing
|
2 |
-
|
3 |
-
import duckdb
|
4 |
-
import lmdb
|
5 |
-
from torch.utils.data import DataLoader, Dataset
|
6 |
-
from tqdm import tqdm
|
7 |
-
|
8 |
-
from .dataset import mimic_cxr_image_path
|
9 |
-
|
10 |
-
|
11 |
-
class JPGDataset(Dataset):
    """Dataset that yields the raw bytes of one JPG file per table row.

    Each item is a dict with the row's ``dicom_id`` encoded as UTF-8 bytes
    under ``'keys'`` and the unmodified file contents under ``'images'``.

    Args:
        df: Table exposing ``.iloc`` positional row access, with
            ``subject_id``, ``study_id`` and ``dicom_id`` columns.
        jpg_path: Base path handed to ``mimic_cxr_image_path`` when
            resolving a row's file (presumably the JPG root directory --
            confirm against that helper).
    """

    def __init__(self, df, jpg_path):
        self.df = df
        self.jpg_path = jpg_path

    def __len__(self):
        """Return the number of rows in the table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Read the JPG for row ``idx`` and return it with its key.

        Returns:
            dict: ``{'keys': bytes, 'images': bytes}``.
        """
        row = self.df.iloc[idx]

        jpg_path = mimic_cxr_image_path(
            self.jpg_path,
            row['subject_id'],
            row['study_id'],
            row['dicom_id'],
            'jpg',
        )

        # Convert key to bytes:
        key = bytes(row['dicom_id'], 'utf-8')

        # Read the .jpg file as bytes (no decoding; the file is stored as-is):
        with open(jpg_path, 'rb') as f:
            image = f.read()

        return {
            'keys': key,
            'images': image,
        }
|
36 |
-
|
37 |
-
def _identity_collate(batch):
    """Return the list of samples unchanged.

    Defined at module level (rather than as a lambda) so it can be pickled
    when DataLoader workers are started with the ``spawn`` method.
    """
    return batch


def prepare_mimic_cxr_jpg_lmdb(mimic_iv_duckdb_path, mimic_cxr_jpg_dir, mimic_cxr_jpg_lmdb_path, map_size_tb, num_workers=None):
    """Copy every distinct JPG listed in the ``mimic_cxr`` table into an LMDB store.

    Rows are read from the DuckDB database, the files are loaded in parallel
    by a ``DataLoader`` over ``JPGDataset``, and each file's bytes are stored
    under its ``dicom_id`` (UTF-8 encoded). Keys already present in the store
    are left untouched.

    Args:
        mimic_iv_duckdb_path: Path of the DuckDB database containing the
            ``mimic_cxr`` table; opened read-only.
        mimic_cxr_jpg_dir: Base path passed to ``JPGDataset`` for locating
            the JPG files.
        mimic_cxr_jpg_lmdb_path: Path of the LMDB environment to write to.
        map_size_tb: LMDB map size in tebibytes (multiplied by ``1024 ** 4``).
        num_workers: Number of DataLoader workers; defaults to
            ``multiprocessing.cpu_count()`` when ``None``.
    """
    if num_workers is None:
        num_workers = multiprocessing.cpu_count()

    # Close the connection even if the query fails.
    connect = duckdb.connect(mimic_iv_duckdb_path, read_only=True)
    try:
        df = connect.sql("SELECT DISTINCT ON(dicom_id) subject_id, study_id, dicom_id FROM mimic_cxr").df()
    finally:
        connect.close()

    # Map size:
    map_size = int(map_size_tb * (1024 ** 4))

    print(f'Map size: {map_size}')

    # DataLoader rejects prefetch_factor when no worker processes are used,
    # so only pass it in the multi-worker case.
    loader_kwargs = {}
    if num_workers > 0:
        loader_kwargs['prefetch_factor'] = 1

    dataset = JPGDataset(df, mimic_cxr_jpg_dir)
    dataloader = DataLoader(
        dataset,
        # batch_size must be positive even when num_workers is 0.
        batch_size=max(1, num_workers),
        shuffle=False,
        num_workers=num_workers,
        collate_fn=_identity_collate,
        **loader_kwargs,
    )

    env = lmdb.open(mimic_cxr_jpg_lmdb_path, map_size=map_size, readonly=False)
    try:
        for batch in tqdm(dataloader):
            # One write transaction per batch instead of one per record.
            with env.begin(write=True) as txn:
                for item in batch:
                    # overwrite=False skips keys that already exist; the
                    # previous check looked up an unrelated constant key and
                    # therefore never skipped anything.
                    txn.put(item['keys'], item['images'], overwrite=False)
        env.sync()
    finally:
        # Release the environment even if reading or writing fails part-way.
        env.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|