anicolson committed on
Commit
95a9c50
1 Parent(s): 6ab63da

Delete lmdb_jpg.py

Browse files
Files changed (1) hide show
  1. lmdb_jpg.py +0 -69
lmdb_jpg.py DELETED
@@ -1,69 +0,0 @@
1
- import multiprocessing
2
-
3
- import duckdb
4
- import lmdb
5
- from torch.utils.data import DataLoader, Dataset
6
- from tqdm import tqdm
7
-
8
- from .dataset import mimic_cxr_image_path
9
-
10
-
11
class JPGDataset(Dataset):
    """Dataset that yields the raw bytes of one .jpg file per dataframe row.

    Each item is a dict with the row's ``dicom_id`` encoded as UTF-8 bytes
    under ``'keys'`` and the unmodified file contents under ``'images'``.
    """

    def __init__(self, df, jpg_path):
        """Store the row table and the value handed to the path helper.

        Args:
            df: Table indexed positionally via ``.iloc``; rows must provide
                ``subject_id``, ``study_id`` and ``dicom_id``.
            jpg_path: First argument passed to ``mimic_cxr_image_path``
                (presumably the root directory of the JPGs; confirm against
                that helper).
        """
        self.df = df
        self.jpg_path = jpg_path

    def __len__(self):
        """Return the number of rows in the table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the key/image-bytes pair for the row at position ``idx``."""
        record = self.df.iloc[idx]
        dicom_id = record['dicom_id']

        # Resolve the on-disk location of this image via the project helper.
        file_path = mimic_cxr_image_path(
            self.jpg_path,
            record['subject_id'],
            record['study_id'],
            dicom_id,
            'jpg',
        )

        # The identifier is stored as bytes alongside the image.
        encoded_key = bytes(dicom_id, encoding='utf-8')

        # The file is read verbatim; no decoding of the JPEG takes place.
        with open(file_path, 'rb') as handle:
            payload = handle.read()

        return dict(keys=encoded_key, images=payload)
36
-
37
def _identity_collate(batch):
    """Return the list of samples unchanged (no tensor stacking)."""
    return batch


def prepare_mimic_cxr_jpg_lmdb(mimic_iv_duckdb_path, mimic_cxr_jpg_dir, mimic_cxr_jpg_lmdb_path, map_size_tb, num_workers=None):
    """Write the raw bytes of every MIMIC-CXR .jpg into an LMDB database.

    One row per distinct ``dicom_id`` is read from the ``mimic_cxr`` table of
    the DuckDB database, the matching .jpg files are loaded by DataLoader
    workers, and each file is stored under its ``dicom_id`` (UTF-8 bytes).

    Args:
        mimic_iv_duckdb_path: Path to the DuckDB database; opened read-only.
        mimic_cxr_jpg_dir: Passed to ``JPGDataset`` as its ``jpg_path``.
        mimic_cxr_jpg_lmdb_path: Path of the LMDB environment to write to.
        map_size_tb: LMDB map size, in units of ``1024 ** 4`` bytes.
        num_workers: Number of DataLoader workers, also used as the batch
            size. Defaults to ``multiprocessing.cpu_count()``. ``0`` loads in
            the main process with a batch size of 1.
    """
    num_workers = num_workers if num_workers is not None else multiprocessing.cpu_count()

    # Close the connection even if the query fails.
    connect = duckdb.connect(mimic_iv_duckdb_path, read_only=True)
    try:
        df = connect.sql("SELECT DISTINCT ON(dicom_id) subject_id, study_id, dicom_id FROM mimic_cxr").df()
    finally:
        connect.close()

    # Map size:
    map_size = int(map_size_tb * (1024 ** 4))

    print(f'Map size: {map_size}')

    dataset = JPGDataset(df, mimic_cxr_jpg_dir)

    loader_kwargs = {
        # DataLoader rejects batch_size=0, so fall back to 1 without workers.
        'batch_size': max(1, num_workers),
        'shuffle': False,
        'num_workers': num_workers,
        # A module-level function (unlike a lambda) can be pickled when the
        # workers are started with the "spawn" method.
        'collate_fn': _identity_collate,
    }
    if num_workers > 0:
        # prefetch_factor may only be specified when worker processes exist.
        loader_kwargs['prefetch_factor'] = 1
    dataloader = DataLoader(dataset, **loader_kwargs)

    env = lmdb.open(mimic_cxr_jpg_lmdb_path, map_size=map_size, readonly=False)
    try:
        for batch in tqdm(dataloader):
            # One write transaction per batch instead of one per image.
            with env.begin(write=True) as txn:
                for item in batch:
                    # NOTE(review): this looks up the fixed key b'image_keys',
                    # not the item's own key, so writes are skipped only when
                    # that key exists in the database. Kept as in the
                    # original; confirm whether a per-item existence check
                    # was intended.
                    if txn.get(b'image_keys') is None:
                        txn.put(item['keys'], item['images'])
            env.sync()
    finally:
        # Release the environment even if loading or writing fails.
        env.close()