File size: 15,228 Bytes
ff8e6c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
import os
import random
from pathlib import Path
from typing import Callable, List, Tuple, Optional, Iterable, Dict, Union
from typing_extensions import TypedDict, Unpack, Required, NotRequired

import numpy as np
import pandas as pd
import torch
import torchvision.transforms.functional as TF
import wandb
from torch.utils.data import Dataset
import torch.nn as nn


def format_mrn(mrn):
    """Return *mrn* as a string, stripped of surrounding whitespace and left-padded with zeros to 20 characters."""
    as_text = str(mrn)
    trimmed = as_text.strip()
    padded = trimmed.zfill(20)
    return padded


class CedarsDatasetTypeAnnotations(TypedDict, total=False):
    """A dummy class used to make IDE autocomplete and tooltips work properly with how we pass **kwargs through in subclasses of CedarsDataset."""
    # Directory containing the input files.
    data_path: Required[Union[Path, str]]
    # CSV or Parquet manifest describing those files.
    manifest_path: Required[Union[Path, str]]
    # Value of the manifest's 'split' column to keep.
    split: NotRequired[str]
    # Manifest column name(s) returned as labels.
    labels: NotRequired[Iterable[str]]
    # Manifest column name(s) returned as extra model inputs.
    extra_inputs: NotRequired[Iterable[str]]
    # CedarsDataset invokes this as update_manifest_func(dataset, manifest),
    # so the callable receives the dataset instance as its first argument.
    update_manifest_func: NotRequired[Callable[["CedarsDataset", pd.DataFrame], pd.DataFrame]]
    # An int is forwarded to DataFrame.sample as n=, anything else as frac=.
    subsample: NotRequired[Union[int, float]]
    # A list of tensor -> tensor callables, one dict -> dict callable, or one nn.Module.
    augmentations: NotRequired[Union[Iterable[Callable[[torch.Tensor], torch.Tensor]], Callable[[dict], dict], nn.Module]]
    # Keys of the output dict that the augmentations are applied to.
    apply_augmentations_to: NotRequired[Iterable[str]]
    # Drop manifest rows whose file is missing from data_path.
    verify_existing: NotRequired[bool]
    # Drop manifest rows with NaN in any label column.
    drop_na_labels: NotRequired[bool]
    # Print progress statements while initializing.
    verbose: NotRequired[bool]


class CedarsDataset(Dataset):
    """
    Generic parent class for several different kinds of common datasets we use here at Cedars CVAIR.

    Expects to be used in a scenario where you have a big folder full of input examples (videos, ecgs, 3d arrays, images, etc.) and a big CSV that contains metadata and labels for those examples, called a 'manifest'.

    Subclasses must override `read_file`, which turns one file on disk into the model input(s) for that example.

    Args:
        data_path: Path to a directory full of files you want the dataset to load from.
        manifest_path: Optional. Path to a CSV or Parquet file containing the names, labels, and/or metadata of your files. Defaults to `<data_path>/manifest.csv`. If the file does not exist, a manifest with a single 'filename' column is built from the directory listing of data_path.
        split: Optional. Allows user to select which split of the manifest to use, assuming the presence of a categorical 'split' column. Defaults to None, meaning that the entire manifest is used by default.
        extra_inputs: Optional. Name(s) of column(s) in the manifest that contain additional numeric inputs to the model. They are returned as a float tensor under the 'extra_inputs' key, in the order given. Defaults to None.
        labels: Optional. Name(s) of column(s) in your manifest which contain training labels, in the order you want them returned. If set to None, the dataset will not return any labels, only filenames and inputs. Defaults to None.
        update_manifest_func: Optional. Allows user to pass in a function to preprocess the manifest after it is loaded and passed through `process_manifest`. It is called as `update_manifest_func(dataset, manifest)` and its return value replaces the manifest.
        subsample: Optional. An int is the number of examples to randomly subsample from the manifest; any other value is used as the fraction of examples to subsample. Defaults to None.
        verbose: Whether to print out progress statements when initializing. Defaults to True.
        augmentations: Optional. Can be a list of augmentation functions which take in a tensor and return a tensor, a single custom augmentation function which takes in a dict and returns a dict, or a single nn.Module. Defaults to None.
        apply_augmentations_to: Optional. A list of strings indicating which batch elements to apply augmentations to. Defaults to ("primary_input").
        verify_existing: Whether to drop manifest rows whose 'filename' is not present in data_path. Defaults to True.
        drop_na_labels: Whether to drop manifest rows that have NaN in any label column. Defaults to True.

    Raises:
        ValueError: If the manifest file exists but is neither a .csv nor a .parquet file.
    """

    def __init__(
        self,
        data_path,
        manifest_path=None,
        split=None,
        labels=None,
        extra_inputs=None,
        update_manifest_func=None,
        subsample=None,
        augmentations=None,
        apply_augmentations_to=("primary_input",),
        verify_existing=True,
        drop_na_labels=True,
        verbose=True,
    ):

        self.data_path = Path(data_path)
        self.augmentations = augmentations
        self.apply_augmentations_to = apply_augmentations_to
        self.extra_inputs = extra_inputs
        self.labels = labels

        # A single nn.Module is treated like a one-element list of tensor augmentations.
        if isinstance(self.augmentations, nn.Module):
            self.augmentations = [self.augmentations]

        if (self.labels is None) and verbose:
            print(
                "No label column names were provided, only filenames and inputs will be returned."
            )

        # Accept a bare string wherever a list of names is expected. Without this,
        # a string would later be iterated character by character.
        if isinstance(self.labels, str):
            self.labels = [self.labels]
        if isinstance(self.extra_inputs, str):
            self.extra_inputs = [self.extra_inputs]
        if isinstance(self.apply_augmentations_to, str):
            self.apply_augmentations_to = (self.apply_augmentations_to,)

        # Read manifest file
        if manifest_path is not None:
            self.manifest_path = Path(manifest_path)
        else:
            self.manifest_path = self.data_path / "manifest.csv"

        if self.manifest_path.exists():
            if self.manifest_path.suffix == ".csv":
                self.manifest = pd.read_csv(self.manifest_path, low_memory=False)
            elif self.manifest_path.suffix == ".parquet":
                self.manifest = pd.read_parquet(self.manifest_path)
            else:
                # Fail here with a clear message instead of an AttributeError on
                # self.manifest a few lines further down.
                raise ValueError(
                    f"Unsupported manifest file type '{self.manifest_path.suffix}' for "
                    f"{self.manifest_path}; expected a .csv or .parquet file."
                )
        else:
            # No manifest on disk: fall back to listing the data directory.
            self.manifest = pd.DataFrame(
                {
                    "filename": os.listdir(self.data_path),
                }
            )

        # do manifest processing that's specific to a given task (different from update_manifest_func,
        # exists as a method overridden in child classes)
        self.manifest = self.process_manifest(self.manifest)

        # Apply user-provided update function to manifest. Note that the dataset
        # instance itself is passed as the first argument.
        if update_manifest_func is not None:
            self.manifest = update_manifest_func(self, self.manifest)

        # Usually set to "train", "val", or "test". If set to None, the entire manifest is used.
        if split is not None:
            self.manifest = self.manifest[self.manifest["split"] == split]
        if verbose:
            print(
                f"Manifest loaded. \nSplit: {split}\nLength: {len(self.manifest):,}"
            )

        # Make sure all files actually exist. This can be disabled for efficiency if
        # you have an especially large dataset
        if verify_existing and "filename" in self.manifest:
            old_len = len(self.manifest)
            existing_files = set(os.listdir(self.data_path))
            self.manifest = self.manifest[
                self.manifest["filename"].isin(existing_files)
            ]
            new_len = len(self.manifest)
            if verbose:
                print(
                    f"{old_len - new_len} files in the manifest are missing from {self.data_path}."
                )
        elif (not verify_existing) and verbose:
            print(
                f"self.verify_existing is set to False, so it's possible for the manifest to contain filenames which are not present in {data_path}"
            )

        # Option to subsample dataset for doing smaller, faster runs
        if subsample is not None:
            if isinstance(subsample, int):
                self.manifest = self.manifest.sample(n=subsample)
            else:
                self.manifest = self.manifest.sample(frac=subsample)
            if verbose:
                # Report the resulting row count; `subsample` itself may be a fraction.
                print(f"{len(self.manifest):,} examples subsampled.")

        # Make sure that there are no NAN labels
        if (self.labels is not None) and drop_na_labels:
            old_len = len(self.manifest)
            self.manifest = self.manifest.dropna(subset=self.labels)
            new_len = len(self.manifest)
            if verbose:
                print(
                    f"{old_len - new_len} examples contained NaN value(s) in their labels and were dropped."
                )
        elif (self.labels is not None) and (not drop_na_labels) and verbose:
            print(
                "drop_na_labels is set to False, so it's possible for the manifest to contain NaN-valued labels."
            )

        # Save manifest to weights and biases run directory
        if wandb.run is not None:
            run_data_path = Path(wandb.run.dir).parent / "data"
            # exist_ok avoids a failure when several datasets (e.g. train and val)
            # create the directory around the same time.
            run_data_path.mkdir(parents=True, exist_ok=True)

            save_name = "manifest.csv"
            if split is not None:
                save_name = f"{split}_{save_name}"

            self.manifest.to_csv(run_data_path / save_name)

            if verbose:
                print(f"Copy of manifest saved to {run_data_path}")

    def __len__(self) -> int:
        """Return the number of rows left in the manifest after all filtering."""
        return len(self.manifest)

    def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
        """
        Build the example for the manifest row at position `index`.

        Returns:
            A dict containing 'filename', optionally 'labels' and 'extra_inputs'
            (float tensors built from the configured manifest columns), and the
            result of `read_file`: either merged in directly when `read_file`
            returns a dict, or stored under 'primary_input' otherwise.
            Augmentations, if configured, are applied before returning.

        Raises:
            KeyError: If the manifest row has no 'filename' entry.
        """
        output = {}
        row = self.manifest.iloc[index]
        if "filename" not in row:
            raise KeyError(
                "filename: the manifest needs a 'filename' column to locate the file to read."
            )
        output["filename"] = row["filename"]

        if self.labels is not None:
            output["labels"] = self._row_to_float_tensor(row, self.labels)

        file_results = self.read_file(self.data_path / output["filename"], row)
        if isinstance(file_results, dict):
            output.update(file_results)
        else:
            output["primary_input"] = file_results

        if self.extra_inputs is not None:
            # Read the columns named by `extra_inputs`, not a column literally
            # called "extra_inputs".
            output["extra_inputs"] = self._row_to_float_tensor(row, self.extra_inputs)

        if self.augmentations is not None:
            output = self.augment(output)

        return output

    @staticmethod
    def _row_to_float_tensor(row: pd.Series, columns: Iterable[str]) -> torch.Tensor:
        """Return the values of `columns` in `row` as a 1-D float32 tensor."""
        # Go through NumPy explicitly: handing the Series straight to the tensor
        # constructor relies on positional Series[0] lookups, which pandas deprecates.
        values = row[list(columns)].to_numpy(dtype=np.float32)
        return torch.tensor(values)

    def process_manifest(self, manifest: pd.DataFrame) -> pd.DataFrame:
        """
        Normalize well-known manifest columns; intended to be overridden or extended by subclasses.

        - 'mrn' is passed through `format_mrn`.
        - 'study_date' and 'dob' are parsed as datetimes (unparseable 'dob' values become NaT).
        - 'study_age' (in years) is added when both date columns are present.
        """
        if "mrn" in manifest.columns:
            manifest["mrn"] = manifest["mrn"].apply(format_mrn)
        if "study_date" in manifest.columns:
            manifest["study_date"] = pd.to_datetime(manifest["study_date"])
        if "dob" in manifest.columns:
            # `infer_datetime_format` is intentionally not passed: it is deprecated
            # since pandas 2.0, where format inference is the default behaviour.
            manifest["dob"] = pd.to_datetime(manifest["dob"], errors="coerce")
        if ("study_date" in manifest.columns) and ("dob" in manifest.columns):
            # 365.2425 days is the length numpy assigns to its "Y" timedelta unit.
            # pandas 2.x rejects that unit as a divisor, so spell the length out.
            manifest["study_age"] = (
                manifest["study_date"] - manifest["dob"]
            ) / pd.Timedelta(days=365.2425)
        return manifest

    def augment(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """
        Apply `self.augmentations` to `output_dict` and return the result.

        If the augmentations are an iterable of tensor -> tensor callables, the
        entries named in `self.apply_augmentations_to` are concatenated along
        dim 0, passed through every augmentation in order, and split back into
        their original keys. If it is a single callable, it receives and returns
        the whole dict.

        Raises:
            TypeError: If `self.augmentations` is neither iterable nor callable.
        """

        if isinstance(self.augmentations, Iterable):
            # would use torch.stack here for cleanliness, but it seems that torchvision
            # transforms v1's claims about supporting "arbitrary leading dimensions" is
            # hogwash. they only support up to 4D. so we have to concatenate along the
            # channel dimension, then apply the augmentations, then split along the channel
            # dimension.
            keys = list(self.apply_augmentations_to)
            channel_counts = [output_dict[key].shape[0] for key in keys]
            augmentable_inputs = torch.cat(
                [output_dict[key] for key in keys], dim=0
            )  # presumably (C*N, T, H, W) for video inputs

            for aug in self.augmentations:
                augmentable_inputs = aug(augmentable_inputs)

            # Hand each key back the slice that corresponds to its own channels.
            place = 0
            for key, n_channels in zip(keys, channel_counts):
                output_dict[key] = augmentable_inputs[place:place + n_channels]
                place += n_channels

        elif isinstance(self.augmentations, Callable):
            output_dict = self.augmentations(output_dict)

        else:
            raise TypeError(
                "self.augmentations must be either an Iterable of augmentations or a single custom augmentation function."
            )

        return output_dict

    def read_file(self, filepath: Path, row: Optional[pd.Series] = None) -> torch.Tensor:
        """
        Load the example stored at `filepath`; must be implemented by subclasses.

        Args:
            filepath: `data_path` joined with the row's 'filename'.
            row: The manifest row for this example.

        Returns:
            Either a tensor (stored under 'primary_input') or a dict that is
            merged into the example returned by `__getitem__`.
        """
        raise NotImplementedError


class ECGDataset(CedarsDataset):
    """Dataset that loads multi-lead ECG arrays from .npy files and returns the selected leads."""

    def __init__(
        self,
        # CedarsDataset params
        data_path: Union[Path, str],
        manifest_path: Optional[Union[Path, str]] = None,
        split: Optional[str] = None,
        labels: Optional[Union[List[str], str]] = None,
        update_manifest_func: Optional[Callable] = None,
        subsample: Optional[Union[int, float]] = None,
        verbose: bool = True,
        verify_existing: bool = True,
        drop_na_labels: bool = True,
        # ECGDataset params
        leads: Optional[Union[List[str], str]] = None,
        random_lead: bool = False,  # New parameter for random lead selection
        data_length: int = 5000,
        **kwargs,
    ):
        """
        Args:
            leads: Which lead(s) you want passed to the model, by name (see `self.lead_order`). Defaults to all 12.
            random_lead: If True, `read_file` ignores `leads` and returns one lead picked uniformly at random on every call.
            data_length: Maximum number of time steps kept from each recording. Defaults to 5000.
            **kwargs: Forwarded to `CedarsDataset.__init__`. The other parameters are documented there.

        Raises:
            TypeError: If the removed `first_lead_only` option is passed.
        """

        # This check has to run before super().__init__(): the parent rejects
        # unknown keyword arguments, so the hint below would never be shown.
        if "first_lead_only" in kwargs:
            raise TypeError(
                '"first_lead_only" has been deprecated. Please pass leads=["I"] '
                "instead if you would like to train on only the first lead."
            )

        super().__init__(
            data_path=data_path,
            manifest_path=manifest_path,
            split=split,
            labels=labels,
            update_manifest_func=update_manifest_func,
            subsample=subsample,
            verbose=verbose,
            verify_existing=verify_existing,
            drop_na_labels=drop_na_labels,
            **kwargs,
        )

        # Row order of the lead axis in the stored arrays, as assumed by read_file.
        self.lead_order = [
            "I",
            "II",
            "III",
            "aVR",
            "aVL",
            "aVF",
            "V1",
            "V2",
            "V3",
            "V4",
            "V5",
            "V6",
        ]
        self.leads = leads
        if self.leads is None:
            # Copy so that editing self.leads later cannot corrupt self.lead_order.
            self.leads = list(self.lead_order)
        if isinstance(self.leads, str):
            self.leads = [self.leads]

        self.random_lead = random_lead  # Storing the random_lead attribute

        self.data_length = data_length

    def read_file(self, filepath, row=None):
        """
        Load one ECG from `filepath` and return the requested leads as a float tensor.

        The array is transposed when its first axis does not have one entry per
        lead, truncated to `self.data_length` samples along the second axis, and
        then reduced to either one random lead or the leads in `self.leads`.

        Raises:
            ValueError: If `self.leads` contains a name missing from `self.lead_order`.
        """
        n_leads = len(self.lead_order)

        # ECGs are usually stored as .npy files.
        file = np.load(filepath)
        if file.shape[0] != n_leads:
            file = file.T
        file = torch.tensor(file).float()

        # Slice the data to the specified length
        file = file[:, :self.data_length]

        if self.random_lead:
            lead_idx = random.randrange(n_leads)
            file = file[lead_idx:lead_idx + 1]  # Slice (not index) to keep the lead axis
        else:
            channels = [self.lead_order.index(lead) for lead in self.leads]
            file = file[channels]

        # Final shape should ideally be NumLeads x Time
        return file


class ECGSingleLeadDataset(CedarsDataset):
    """Dataset that loads ECG recordings from .npy files and adds a leading axis of size 1 to each."""

    def __init__(
        self,
        # CedarsDataset params
        data_path: Union[Path, str],
        manifest_path: Optional[Union[Path, str]] = None,
        labels: Optional[Union[List[str], str]] = None,
        update_manifest_func: Optional[Callable] = None,
        subsample: Optional[Union[int, float]] = None,
        verbose: bool = True,
        verify_existing: bool = True,
        drop_na_labels: bool = True,
        **kwargs,
    ):
        """
        All arguments are forwarded unchanged to `CedarsDataset.__init__`.

        Args:
            **kwargs: Any further `CedarsDataset` arguments (for example `split`).
        """

        super().__init__(
            data_path=data_path,
            manifest_path=manifest_path,
            labels=labels,
            update_manifest_func=update_manifest_func,
            subsample=subsample,
            verbose=verbose,
            verify_existing=verify_existing,
            drop_na_labels=drop_na_labels,
            **kwargs,
        )

    def read_file(self, filepath, row=None):
        """
        Load the array at `filepath` and return it as a float tensor with a new leading axis.

        Raises:
            Exception: Whatever `np.load` raised, after the offending path and
                the error have been printed.
        """
        # ECGs are usually stored as .npy files.
        try:
            file = np.load(filepath)
        except Exception as e:
            # Print which file failed, then let the real error propagate. Swallowing
            # it would only produce an UnboundLocalError on `file` below.
            print(filepath)
            print(e)
            raise

        # unsqueeze(0) adds the lead axis; presumably the stored array is 1-D (Time),
        # giving a 1 x Time tensor. TODO confirm against the stored files.
        return torch.tensor(file).float().unsqueeze(0)