aehrc
/

cxrmate-ed

Model card Files Files and versions Community

anicolson committed about 21 hours ago

Commit

9691248

•

1 Parent(s): 1c38939

Upload model

Browse files

Files changed (11) hide show

config.json +14 -55
configuration_cxrmate_ed.py +39 -0
configuration_uniformer.py +51 -0
create_section_files.py +7 -5
dataset.py +70 -241
generation_config.json +1 -1
model.safetensors +2 -2
modelling_cxrmate_ed.py +412 -337
modelling_uniformer.py +4 -4
prepare_dataset.py +558 -0
utils.py +20 -0

config.json CHANGED Viewed

@@ -21,11 +21,6 @@
     "diversity_penalty": 0.0,
     "do_sample": false,
     "early_stopping": false,
-    "ed_module_columns": [
-      "triage_chiefcomplaint",
-      "triage_pain",
-      "vitalsign_pain"
-    ],
     "encoder_no_repeat_ngram_size": 0,
     "eos_token_id": 2,
     "exponential_decay_length_penalty": null,
@@ -34,16 +29,12 @@
     "forced_eos_token_id": null,
     "hidden_act": "silu",
     "hidden_size": 768,
     "id2label": {
       "0": "LABEL_0",
       "1": "LABEL_1"
     },
     "include_time_delta": true,
-    "index_value_encoder_config": {
-      "edstays": 40,
-      "triage": 7,
-      "vitalsign": 1177
-    },
     "index_value_encoder_intermediate_size": 2048,
     "initializer_range": 0.02,
     "intermediate_size": 3072,
@@ -56,10 +47,6 @@
     "length_penalty": 1.0,
     "max_length": 20,
     "max_position_embeddings": 2048,
-    "mimic_cxr_columns": [
-      "indication",
-      "history"
-    ],
     "min_length": 0,
     "model_type": "llama",
     "no_repeat_ngram_size": 0,
@@ -69,7 +56,6 @@
     "num_hidden_layers": 6,
     "num_key_value_heads": 12,
     "num_return_sequences": 1,
-    "num_token_types": 19,
     "output_attentions": false,
     "output_hidden_states": false,
     "output_scores": false,
@@ -77,6 +63,10 @@
     "prefix": null,
     "pretraining_tp": 1,
     "problem_type": null,
     "pruned_heads": {},
     "remove_invalid_values": false,
     "repetition_penalty": 1.0,
@@ -85,39 +75,19 @@
     "rms_norm_eps": 1e-06,
     "rope_scaling": null,
     "rope_theta": 10000.0,
-    "section_ids": [
-      12,
-      13
-    ],
     "sep_token_id": null,
     "suppress_tokens": null,
     "task_specific_params": null,
     "temperature": 1.0,
     "tf_legacy_loss": false,
     "tie_encoder_decoder": false,
     "tie_word_embeddings": false,
     "time_delta_monotonic_inversion": true,
-    "token_type_to_token_type_id": {
-      "comparison": 15,
-      "edstays": 1,
-      "findings": 12,
-      "history": 11,
-      "image": 14,
-      "impression": 13,
-      "indication": 10,
-      "medrecon": 0,
-      "medrecon_name": 6,
-      "mimic_cxr_2_0_0_metadata": 5,
-      "previous_findings": 16,
-      "previous_image": 18,
-      "previous_impression": 17,
-      "pyxis": 4,
-      "triage": 2,
-      "triage_chiefcomplaint": 7,
-      "triage_pain": 8,
-      "vitalsign": 3,
-      "vitalsign_pain": 9
-    },
     "tokenizer_class": null,
     "top_k": 50,
     "top_p": 1.0,
@@ -126,14 +96,12 @@
     "typical_p": 1.0,
     "use_bfloat16": false,
     "use_cache": true,
-    "vocab_size": 30000,
-    "zero_time_delta_value": 1.0
   },
   "encoder": {
     "_name_or_path": "",
     "add_cross_attention": false,
     "architectures": null,
-    "attention_probs_dropout_prob": 0.0,
     "attn_drop_rate": 0.0,
     "bad_words_ids": null,
     "begin_suppress_tokens": null,
@@ -160,24 +128,18 @@
       512
     ],
     "encoder_no_repeat_ngram_size": 0,
-    "encoder_stride": 16,
     "eos_token_id": null,
     "exponential_decay_length_penalty": null,
     "finetuning_task": null,
     "forced_bos_token_id": null,
     "forced_eos_token_id": null,
     "head_dim": 64,
-    "hidden_act": "gelu",
-    "hidden_dropout_prob": 0.0,
-    "hidden_size": 768,
     "id2label": {
       "0": "LABEL_0",
       "1": "LABEL_1"
     },
     "image_size": 384,
     "in_chans": 3,
-    "initializer_range": 0.02,
-    "intermediate_size": 3072,
     "is_decoder": false,
     "is_encoder_decoder": false,
     "label2id": {
@@ -189,14 +151,11 @@
     "max_length": 20,
     "min_length": 0,
     "mlp_ratio": 4,
-    "model_type": "vit",
     "no_repeat_ngram_size": 0,
-    "num_attention_heads": 12,
     "num_beam_groups": 1,
     "num_beams": 1,
-    "num_channels": 3,
     "num_classes": 1000,
-    "num_hidden_layers": 12,
     "num_return_sequences": 1,
     "output_attentions": false,
     "output_hidden_states": false,
@@ -234,8 +193,8 @@
     "typical_p": 1.0,
     "use_bfloat16": false
   },
-  "model_type": "vision-encoder-decoder",
   "tie_word_embeddings": false,
   "torch_dtype": "float32",
-  "transformers_version": "4.40.2"
 }

     "diversity_penalty": 0.0,
     "do_sample": false,
     "early_stopping": false,
     "encoder_no_repeat_ngram_size": 0,
     "eos_token_id": 2,
     "exponential_decay_length_penalty": null,
     "forced_eos_token_id": null,
     "hidden_act": "silu",
     "hidden_size": 768,
+    "history": 0,
     "id2label": {
       "0": "LABEL_0",
       "1": "LABEL_1"
     },
     "include_time_delta": true,
     "index_value_encoder_intermediate_size": 2048,
     "initializer_range": 0.02,
     "intermediate_size": 3072,
     "length_penalty": 1.0,
     "max_length": 20,
     "max_position_embeddings": 2048,
     "min_length": 0,
     "model_type": "llama",
     "no_repeat_ngram_size": 0,
     "num_hidden_layers": 6,
     "num_key_value_heads": 12,
     "num_return_sequences": 1,
     "output_attentions": false,
     "output_hidden_states": false,
     "output_scores": false,
     "prefix": null,
     "pretraining_tp": 1,
     "problem_type": null,
+    "prompt_report_sections_filter": [
+      "indication",
+      "history"
+    ],
     "pruned_heads": {},
     "remove_invalid_values": false,
     "repetition_penalty": 1.0,
     "rms_norm_eps": 1e-06,
     "rope_scaling": null,
     "rope_theta": 10000.0,
     "sep_token_id": null,
     "suppress_tokens": null,
+    "tables_filter": [
+      "mimic_cxr_sectioned",
+      "triage",
+      "medrecon"
+    ],
     "task_specific_params": null,
     "temperature": 1.0,
     "tf_legacy_loss": false,
     "tie_encoder_decoder": false,
     "tie_word_embeddings": false,
     "time_delta_monotonic_inversion": true,
     "tokenizer_class": null,
     "top_k": 50,
     "top_p": 1.0,
     "typical_p": 1.0,
     "use_bfloat16": false,
     "use_cache": true,
+    "vocab_size": 30000
   },
   "encoder": {
     "_name_or_path": "",
     "add_cross_attention": false,
     "architectures": null,
     "attn_drop_rate": 0.0,
     "bad_words_ids": null,
     "begin_suppress_tokens": null,
       512
     ],
     "encoder_no_repeat_ngram_size": 0,
     "eos_token_id": null,
     "exponential_decay_length_penalty": null,
     "finetuning_task": null,
     "forced_bos_token_id": null,
     "forced_eos_token_id": null,
     "head_dim": 64,
     "id2label": {
       "0": "LABEL_0",
       "1": "LABEL_1"
     },
     "image_size": 384,
     "in_chans": 3,
     "is_decoder": false,
     "is_encoder_decoder": false,
     "label2id": {
     "max_length": 20,
     "min_length": 0,
     "mlp_ratio": 4,
+    "model_type": "uniformer",
     "no_repeat_ngram_size": 0,
     "num_beam_groups": 1,
     "num_beams": 1,
     "num_classes": 1000,
     "num_return_sequences": 1,
     "output_attentions": false,
     "output_hidden_states": false,
     "typical_p": 1.0,
     "use_bfloat16": false
   },
+  "model_type": "encoder-decoder",
   "tie_word_embeddings": false,
   "torch_dtype": "float32",
+  "transformers_version": "4.39.3"
 }

configuration_cxrmate_ed.py ADDED Viewed

	@@ -0,0 +1,39 @@

+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class EncoderDecoderConfig(PretrainedConfig):
+    model_type = "encoder-decoder"
+    is_composition = True
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        if "encoder" not in kwargs or "decoder" not in kwargs:
+            raise ValueError(
+                f"A configuraton of type {self.model_type} cannot be instantiated because "
+                f"both `encoder` and `decoder` sub-configurations were not passed, only {kwargs}"
+            )
+        self.encoder = kwargs.pop("encoder")
+        self.decoder = kwargs.pop("decoder")
+        self.is_encoder_decoder = True
+    @classmethod
+    def from_encoder_decoder_configs(
+        cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
+    ) -> PretrainedConfig:
+        r"""
+        Instantiate a [`EncoderDecoderConfig`] (or a derived class) from a pre-trained encoder model configuration and
+        decoder model configuration.
+        Returns:
+            [`EncoderDecoderConfig`]: An instance of a configuration object
+        """
+        logger.info("Set `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config")
+        decoder_config.is_decoder = True
+        decoder_config.add_cross_attention = True
+        return cls(encoder=encoder_config, decoder=decoder_config, **kwargs)

configuration_uniformer.py ADDED Viewed

	@@ -0,0 +1,51 @@

+from transformers import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class UniFormerWithProjectionHeadConfig(PretrainedConfig):
+    model_type = 'uniformer'
+    def __init__(
+            self,
+            projection_size=None,
+            embed_dim=[64, 128, 320, 512],
+            image_size=384,
+            in_chans=3,
+            depth=[5, 8, 20, 7],
+            patch_size=[4, 2, 2, 2],
+            head_dim=64,
+            mlp_ratio=4,
+            qkv_bias=True,
+            num_classes=1000,
+            qk_scale=None,
+            representation_size=None,
+            drop_rate=0.0,
+            drop_path_rate=0.3,
+            attn_drop_rate=0.0,
+            conv_stem=False,
+            layer_norm_eps=1e-6,
+            **kwargs,
+        ):
+        super().__init__(
+            layer_norm_eps=layer_norm_eps,
+            image_size=image_size,
+            qkv_bias=qkv_bias,
+            **kwargs,
+        )
+        self.projection_size = projection_size
+        self.embed_dim = embed_dim
+        self.in_chans = in_chans
+        self.depth = depth
+        self.patch_size = patch_size
+        self.head_dim = head_dim
+        self.mlp_ratio = mlp_ratio
+        self.num_classes = num_classes
+        self.qk_scale = qk_scale
+        self.representation_size = representation_size
+        self.drop_rate = drop_rate
+        self.drop_path_rate = drop_path_rate
+        self.attn_drop_rate = attn_drop_rate
+        self.conv_stem = conv_stem

create_section_files.py CHANGED Viewed

@@ -4,8 +4,10 @@ from pathlib import Path
 from tqdm import tqdm
-# local folder import
-from .section_parser import custom_mimic_cxr_rules, section_text
 def list_rindex(l, s):
@@ -98,7 +100,7 @@ def create_section_files(reports_path, output_path, no_split):
                 # exist the radiologist has usually written the report
                 # in the comparison section
                 idx = -1
-                for sn in ('impression', 'findings', 'indication', 'history', 'last_paragraph', 'comparison'):
                     if sn in section_names:
                         idx = list_rindex(section_names, sn)
                         break
@@ -112,7 +114,7 @@ def create_section_files(reports_path, output_path, no_split):
                     patient_studies.append([s_stem, sections[idx].strip()])
                 study_sectioned = [s_stem]
-                for sn in ('impression', 'findings', 'indication', 'history', 'last_paragraph', 'comparison'):
                     if sn in section_names:
                         idx = list_rindex(section_names, sn)
                         study_sectioned.append(sections[idx].strip())
@@ -125,7 +127,7 @@ def create_section_files(reports_path, output_path, no_split):
         with open(output_path / 'mimic_cxr_sectioned.csv', 'w') as fp:
             csvwriter = csv.writer(fp)
             # write header
-            csvwriter.writerow(['study', 'impression', 'findings', 'indication', 'history', 'last_paragraph', 'comparison'])
             for row in study_sections:
                 csvwriter.writerow(row)

 from tqdm import tqdm
+try:
+    from .section_parser import custom_mimic_cxr_rules, section_text
+except ImportError:
+    from section_parser import custom_mimic_cxr_rules, section_text
 def list_rindex(l, s):
                 # exist the radiologist has usually written the report
                 # in the comparison section
                 idx = -1
+                for sn in ('impression', 'findings', 'indication', 'history', 'technique', 'last_paragraph', 'comparison'):
                     if sn in section_names:
                         idx = list_rindex(section_names, sn)
                         break
                     patient_studies.append([s_stem, sections[idx].strip()])
                 study_sectioned = [s_stem]
+                for sn in ('impression', 'findings', 'indication', 'history', 'technique', 'last_paragraph', 'comparison'):
                     if sn in section_names:
                         idx = list_rindex(section_names, sn)
                         study_sectioned.append(sections[idx].strip())
         with open(output_path / 'mimic_cxr_sectioned.csv', 'w') as fp:
             csvwriter = csv.writer(fp)
             # write header
+            csvwriter.writerow(['study', 'impression', 'findings', 'indication', 'history', 'technique', 'last_paragraph', 'comparison'])
             for row in study_sections:
                 csvwriter.writerow(row)

dataset.py CHANGED Viewed

@@ -1,253 +1,82 @@
-import os
-import lmdb
-import pandas as pd
 import torch
-from torch.utils.data import Dataset
-from torchvision.io import decode_image, read_image
-# Ordered by oblique, lateral, AP, and then PA views so that PA views are closest in position to the generated tokens (and oblique is furtherest).
-VIEW_ORDER = ['LPO', 'RAO', 'LAO', 'SWIMMERS', 'XTABLE LATERAL', 'LL', 'LATERAL',  'AP AXIAL', 'AP RLD', 'AP LLD', 'AP', 'PA RLD', 'PA LLD', 'PA']
-def mimic_cxr_image_path(dir, subject_id, study_id, dicom_id, ext='dcm'):
-    return os.path.join(dir, 'p' + str(subject_id)[:2], 'p' + str(subject_id),
-                        's' + str(study_id), str(dicom_id) + '.' + ext)
-class StudyIDEDStayIDSubset(Dataset):
-    """
-    Study ID & ED stay ID subset. Examples are indexed by the study identifier.
-    Information from the ED module is added by finding the study_id that is within
-    the timespan of the stay_id for the subject_id. The history and indication
-    sections are also included.
-    """
-    def __init__(
-        self,
-        split,
-        records,
-        mimic_cxr_jpg_lmdb_path=None,
-        mimic_cxr_dir=None,
-        max_images_per_study=None,
-        transforms=None,
-        images=True,
-        columns='study_id, dicom_id, subject_id, findings, impression',
-        and_condition='',
-        study_id_inclusion_list=None,
-        return_images=True,
-        ed_module=True,
-        extension='jpg',
-    ):
-        """
-        Argument/s:
-            split - 'train', 'validate', or 'test'.
-            records - MIMIC-CXR & MIMIC-IV-ED records class instance.
-            mimic_cxr_jpg_lmdb_path - JPG database for MIMIC-CXR-JPG.
-            mimic_cxr_dir - Path to the MIMIC-CXR directory containing the patient study subdirectories with the JPG or DCM images.
-            max_images_per_study - the maximum number of images per study.
-            transforms - torchvision transformations.
-            colour_space - PIL target colour space.
-            images - flag to return processed images.
-            columns - which columns to query on.
-            and_condition - AND condition to add to the SQL query.
-            study_id_inclusion_list - studies not in this list are excluded.
-            return_images - return CXR images for the study as tensors.
-            ed_module - use the ED module.
-            extension - 'jpg' or 'dcm'.
-        """
-        super(StudyIDEDStayIDSubset, self).__init__()
-        self.split = split
-        self.mimic_cxr_jpg_lmdb_path = mimic_cxr_jpg_lmdb_path
-        self.mimic_cxr_dir = mimic_cxr_dir
-        self.records = records
-        self.max_images_per_study = max_images_per_study
-        self.transforms = transforms
-        self.images = images
-        self.columns = columns
-        self.and_condition = and_condition
-        self.return_images = return_images
-        self.ed_module = ed_module
-        self.extension = extension
-        # If max images per study is not set:
-        self.max_images_per_study = float('inf') if self.max_images_per_study is None else self.max_images_per_study
-        assert self.extension == 'jpg' or self.extension == 'dcm', '"extension" can only be either "jpg" or "dcm".'
-        assert (mimic_cxr_jpg_lmdb_path is None) != (mimic_cxr_dir is None), 'Either "mimic_cxr_jpg_lmdb_path" or "mimic_cxr_dir" can be set.'
-        if self.mimic_cxr_dir is not None and self.mimic_cxr_jpg_lmdb_path is None:
-            if self.extension == 'jpg':
-                if 'physionet.org/files/mimic-cxr-jpg/2.0.0/files' not in self.mimic_cxr_dir:
-                    self.mimic_cxr_dir = os.path.join(self.mimic_cxr_dir, 'physionet.org/files/mimic-cxr-jpg/2.0.0/files')
-            elif self.extension == 'dcm':
-                if 'physionet.org/files/mimic-cxr/2.0.0/files' not in self.mimic_cxr_dir:
-                    self.mimic_cxr_dir = os.path.join(self.mimic_cxr_dir, 'physionet.org/files/mimic-cxr/2.0.0/files')
-        query = f"""
-        SELECT {columns}
-        FROM mimic_cxr
-        WHERE split = '{split}'
-        {and_condition}
-        ORDER BY study_id
-        """
-        # For multi-image, the study identifiers make up the training examples:
-        df = self.records.connect.sql(query).df()
-        # Drop studies that don't have a findings or impression section:
-        df = df.dropna(subset=['findings', 'impression'], how='any')
-        # This study has two rows in edstays (removed as it causes issues):
-        if self.ed_module:
-            df = df[df['study_id'] != 59128861]
-        # Exclude studies not in list:
-        if study_id_inclusion_list is not None:
-            df = df[df['study_id'].isin(study_id_inclusion_list)]
-        # Example study identifiers for the subset:
-        self.examples = df['study_id'].unique().tolist()
-        # Record statistics:
-        self.num_study_ids = len(self.examples)
-        self.num_dicom_ids = len(df['dicom_id'].unique().tolist())
-        self.num_subject_ids = len(df['subject_id'].unique().tolist())
-        # Prepare the LMDB .jpg database:
-        if self.mimic_cxr_jpg_lmdb_path is not None:
-            print('Loading images using LMDB.')
-            # Map size:
-            map_size = int(0.65 * (1024 ** 4))
-            assert isinstance(map_size, int)
-            self.env = lmdb.open(self.mimic_cxr_jpg_lmdb_path, map_size=map_size, lock=False, readonly=True)
-            self.txn = self.env.begin(write=False)
-    def __len__(self):
-        return self.num_study_ids
-    def __getitem__(self, index):
-        study_id = self.examples[index]
-        # Get the study:
-        study = self.records.connect.sql(
-            f"""
-            SELECT dicom_id, study_id, subject_id, study_datetime, ViewPosition
-            FROM mimic_cxr
-            WHERE (study_id = {study_id});
             """
-        ).df()
-        subject_id = study.iloc[0, study.columns.get_loc('subject_id')]
-        study_id = study.iloc[0, study.columns.get_loc('study_id')]
-        study_datetime = study['study_datetime'].max()
-        example_dict = {
-            'study_ids': study_id,
-            'subject_id': subject_id,
-            'index': index,
-        }
-        example_dict.update(self.records.return_mimic_cxr_features(study_id))
-        if self.ed_module:
-            edstays = self.records.connect.sql(
-                f"""
-                SELECT stay_id, intime, outtime
-                FROM edstays
-                WHERE (subject_id = {subject_id})
-                AND intime < '{study_datetime}'
-                AND outtime > '{study_datetime}';
-                """
-            ).df()
-            assert len(edstays) <= 1
-            stay_id = edstays.iloc[0, edstays.columns.get_loc('stay_id')] if not edstays.empty else None
-            self.records.clear_start_end_times()
-            example_dict.update(self.records.return_ed_module_features(stay_id, study_datetime))
-            example_dict['stay_ids'] = stay_id
-        if self.return_images:
-            example_dict['images'], example_dict['image_time_deltas'] = self.get_images(study, study_datetime)
-        return example_dict
-    def get_images(self, example, reference_time):
-        """
-        Get the image/s for a given example.
-        Argument/s:
-            example - dataframe for the example.
-            reference_time - reference_time for time delta.
-        Returns:
-            The image/s for the example
-        """
-        # Sample if over max_images_per_study. Only allowed during training:
-        if len(example) > self.max_images_per_study:
-            assert self.split == 'train'
-            example = example.sample(n=self.max_images_per_study, axis=0)
-        # Order by ViewPostion:
-        example['ViewPosition'] = example['ViewPosition'].astype(pd.CategoricalDtype(categories=VIEW_ORDER, ordered=True))
-        # Sort the DataFrame based on the categorical column
-        example = example.sort_values(by=['study_datetime', 'ViewPosition'])
-        # Load and pre-process each CXR:
-        images, time_deltas = [], []
-        for _, row in example.iterrows():
-            images.append(
-                self.load_and_preprocess_image(
-                    row['subject_id'],
-                    row['study_id'],
-                    row['dicom_id'],
-                ),
-            )
-            time_deltas.append(self.records.compute_time_delta(row['study_datetime'], reference_time, to_tensor=False))
-        if self.transforms is not None:
-            images = torch.stack(images, 0)
-        return images, time_deltas
-    def load_and_preprocess_image(self, subject_id, study_id, dicom_id):
-        """
-        Load and preprocess an image using torchvision.transforms.v2:
-            https://pytorch.org/vision/stable/auto_examples/transforms/plot_transforms_getting_started.html#sphx-glr-auto-examples-transforms-plot-transforms-getting-started-py
-        Argument/s:
-            subject_id - subject identifier.
-            study_id - study identifier.
-            dicom_id - DICOM identifier.
-        Returns:
-            image - Tensor of the CXR.
-        """
-        if self.extension == 'jpg':
-            if self.mimic_cxr_jpg_lmdb_path is not None:
-                # Convert to bytes:
-                key = bytes(dicom_id, 'utf-8')
-                # Retrieve image:
-                image = bytearray(self.txn.get(key))
-                image = torch.frombuffer(image, dtype=torch.uint8)
-                image = decode_image(image)
-            else:
-                image_file_path = mimic_cxr_image_path(self.mimic_cxr_dir, subject_id, study_id, dicom_id, self.extension)
-                image = read_image(image_file_path)
-        elif self.extension == 'dcm':
-            raise NotImplementedError
-        if self.transforms is not None:
-            image = self.transforms(image)
-        return image

+import itertools
+from typing import List
 import torch
+from .utils import compute_time_delta
+class PriorsDataset:
+    def __init__(self, dataset, history, time_delta_map):
+        self.dataset = dataset
+        self.history = history
+        self.study_id_to_index = dict(zip(dataset['study_id'], range(len(dataset))))
+        self.time_delta_map = time_delta_map
+        self.inf_time_delta_value = time_delta_map(float('inf'))
+    def __getitem__(self, idx):
+        batch = self.dataset[idx]
+        if self.history:
+            # Prior studies:
+            prior_study_indices = [
+                None if i is None else [self.study_id_to_index[j] for j in i[:self.history]] for i in batch['prior_study_ids']
+            ]
+            prior_studies = [None if i is None else [self.dataset[j] for j in i] for i in prior_study_indices]
+            # Prior time deltas:
+            time_deltas = [
+                None if i is None else [compute_time_delta(k['latest_study_datetime'], j, self.time_delta_map, to_tensor=False) for k in i] for i, j in zip(prior_studies, batch['latest_study_datetime'])
+            ]
+            # Prior findings and impressions:
+            batch['prior_findings'] = [
+                None if i is None else [j['findings'] for j in i] for i in prior_studies
+            ]
+            batch['prior_impression'] = [
+                None if i is None else [j['findings'] for j in i] for i in prior_studies
+            ]
+            batch['prior_findings_time_delta'] = time_deltas.copy()
+            batch['prior_impression_time_delta'] = time_deltas.copy()
+            # Prior images:
             """
+            Note:
+            Random selection of max_train_images_per_study from the study if the number of images for a study exceeds max_train_images_per_study is performed in train_set_transform and test_set_transform.
+            Sorting the images based on the view is done in test_set_transform.
+            No need to do it here.
+            """
+            prior_images = [
+                torch.cat(
+                    [
+                        torch.empty(0, *batch['images'].shape[-3:])
+                    ] if i is None else [j['images'] for j in i]
+                ) for i in prior_studies
+            ]
+            prior_images = torch.nn.utils.rnn.pad_sequence(prior_images, batch_first=True, padding_value=0.0)
+            batch['images'] = torch.cat([batch['images'], prior_images], dim=1)
+            prior_image_time_deltas = [
+                None if i is None else list(itertools.chain.from_iterable([y] * x['images'].shape[0] for x, y in zip(i, j)))
+                for i, j in zip(prior_studies, time_deltas)
+            ]
+            max_len = max((len(item) for item in prior_image_time_deltas if item is not None), default=0)
+            prior_image_time_deltas = [i + [self.inf_time_delta_value] * (max_len - len(i)) if i else [self.inf_time_delta_value] * max_len for i in prior_image_time_deltas]
+            batch['image_time_deltas'] = [i + j for i, j in zip(batch['image_time_deltas'], prior_image_time_deltas)]
+        return batch
+    def __len__(self):
+        return len(self.dataset)
+    def __getattr__(self, name):
+        return getattr(self.dataset, name)
+    def __getitems__(self, keys: List):
+        batch = self.__getitem__(keys)
+        n_examples = len(batch[next(iter(batch))])
+        return [{col: array[i] for col, array in batch.items()} for i in range(n_examples)]

generation_config.json CHANGED Viewed

@@ -3,5 +3,5 @@
   "bos_token_id": 1,
   "eos_token_id": 2,
   "pad_token_id": 4,
-  "transformers_version": "4.40.2"
 }

   "bos_token_id": 1,
   "eos_token_id": 2,
   "pad_token_id": 4,
+  "transformers_version": "4.39.3"
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e4b1ed2a5298bb8999cb91a9b905ace6733e5c66ebdef9702baa4d421428fad3
-size 644854104

 version https://git-lfs.github.com/spec/v1
+oid sha256:ffbf3e699a139ad98f20f8e057cd085586aea444b4b015471d697b43b440c14e
+size 789958760

modelling_cxrmate_ed.py CHANGED Viewed

@@ -1,33 +1,32 @@
 import math
 import os
-from glob import glob
-from pathlib import Path
 from typing import Optional, Tuple, Union
-import duckdb
-import pandas as pd
 import torch
 import transformers
 from torch.nn import CrossEntropyLoss
-from tqdm import tqdm
 from transformers import PreTrainedTokenizerFast, VisionEncoderDecoderModel
 from transformers.configuration_utils import PretrainedConfig
 from transformers.modeling_outputs import Seq2SeqLMOutput
 from transformers.modeling_utils import PreTrainedModel
-from transformers.models.vision_encoder_decoder.configuration_vision_encoder_decoder import (
-    VisionEncoderDecoderConfig,
-)
 from transformers.utils import logging
-from .create_section_files import create_section_files
-from .dataset import StudyIDEDStayIDSubset
-from .lmdb_jpg import prepare_mimic_cxr_jpg_lmdb
 from .modelling_uniformer import MultiUniFormerWithProjectionHead
-from .records import EDCXRSubjectRecords
-from .tables import ed_module_tables, mimic_cxr_tables
 logger = logging.get_logger(__name__)
 def create_lookup_table(df, columns, start_idx):
     df = df.groupby(columns).head(1)[columns].sort_values(by=columns)
@@ -49,12 +48,12 @@ class FNNEncoder(torch.nn.Module):
 class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
-    config_class = VisionEncoderDecoderConfig
     base_model_prefix = "vision_encoder_decoder"
     main_input_name = "input_ids"
     supports_gradient_checkpointing = True
-    def __init__(
         self,
         config: Optional[PretrainedConfig] = None,
         encoder: Optional[PreTrainedModel] = None,
@@ -70,7 +69,7 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
         if config is None and (encoder is None or decoder is None):
             raise ValueError("Either a configuration or an encoder and a decoder has to be provided.")
         if config is None:
-            config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config)
         else:
             if not isinstance(config, self.config_class):
                 raise ValueError(f"Config: {config} has to be of type {self.config_class}")
@@ -111,29 +110,50 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
         assert not config.decoder.is_encoder_decoder
         assert 'pad_token_id' in self.decoder.config.__dict__
         assert 'time_delta_monotonic_inversion' in self.decoder.config.__dict__
-        assert 'zero_time_delta_value' in self.decoder.config.__dict__
         assert 'add_time_deltas' in self.decoder.config.__dict__
         assert isinstance(self.decoder.config.time_delta_monotonic_inversion, bool)
-        assert isinstance(self.decoder.config.zero_time_delta_value, float)
-        for k, v in self.decoder.config.index_value_encoder_config.items():
-            setattr(
-                self,
-                f'{k}_index_value_encoder',
-                FNNEncoder(
-                    num_features=v,
-                    intermediate_size=self.decoder.config.index_value_encoder_intermediate_size,
-                    decoder_hidden_size=self.decoder.config.hidden_size,
-                ),
-            )
         if self.decoder.config.add_time_deltas:
             self.time_delta_encoder = FNNEncoder(
                 num_features=1,
                 intermediate_size=self.decoder.config.index_value_encoder_intermediate_size,
                 decoder_hidden_size=self.decoder.config.hidden_size,
             )
-        self.token_type_embeddings = torch.nn.Embedding(self.decoder.config.num_token_types, self.decoder.config.hidden_size)
     @classmethod
     def from_encoder_decoder_pretrained(
@@ -281,7 +301,7 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
             decoder = transformers.AutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
         # instantiate config with corresponding kwargs
-        config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs)
         # make sure input & output embeddings is not tied
         config.tie_word_embeddings = False
@@ -292,13 +312,13 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
     def forward(
         self,
         decoder_input_ids: Optional[torch.LongTensor] = None,
-        decoder_attention_mask: Optional[torch.FloatTensor] = None,
-        decoder_token_type_ids: Optional[torch.LongTensor] = None,
         encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
-        decoder_position_ids: Optional[torch.LongTensor] = None,
         labels: Optional[torch.LongTensor] = None,
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
@@ -313,10 +333,7 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
             argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
         }
-        assert decoder_position_ids is not None
-        assert decoder_attention_mask is not None
         assert decoder_attention_mask.dtype == torch.long, f'The dtype for {decoder_attention_mask} was {decoder_attention_mask.dtype}. It should be torch.long'
-        assert decoder_token_type_ids is not None
         if decoder_inputs_embeds is None:
             decoder_inputs_embeds = self.decoder.get_input_embeddings()(decoder_input_ids)
@@ -362,7 +379,6 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
         special_token_ids,
         prompt_attention_mask,
         prompt_position_ids,
-        token_type_id_sections=None,
         past_key_values=None,
         use_cache=None,
         **kwargs,
@@ -387,7 +403,10 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
             # `inputs_embeds` are only to be used in the 1st generation step:
             inputs_embeds = torch.cat([kwargs['decoder_inputs_embeds'], self.decoder.get_input_embeddings()(input_ids)], dim=1)
-            decoder_token_type_ids = self.token_ids_to_token_type_ids(input_ids, special_token_ids, token_type_id_sections)
             decoder_token_type_ids = torch.cat(
                 [
                     kwargs['decoder_token_type_ids'],
@@ -411,7 +430,11 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
             decoder_position_ids.masked_fill_(report_attention_mask == 0, 1)
             # Always place token_ids_to_token_type_ids_past_key_values before input_ids = input_ids[:, remove_prefix_length:]:
-            decoder_token_type_ids = self.token_ids_to_token_type_ids_past_key_values(input_ids, special_token_ids, token_type_id_sections)
             decoder_position_ids = decoder_position_ids[:, -1:]
             past_length = past_key_values[0][0].shape[2]
@@ -437,7 +460,7 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
         )
         return input_dict
-    def token_ids_to_token_type_ids(self, token_ids, special_token_ids, token_type_id_sections=None):
         """
         Extract token type identifiers from the token identifiers.
@@ -480,7 +503,7 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
         return token_type_ids
-    def token_ids_to_token_type_ids_past_key_values(self, token_ids, special_token_ids, token_type_id_sections=None):
         """
         Extract token type identifiers from the token identifiers if past != None. Make sure to input all the
         token_ids (e.g., do not input input_ids = input_ids[:, remove_prefix_length:] from prepare_inputs_for_generation).
@@ -649,7 +672,7 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
         return tuple(sections.values())
-    def tokenize_text_columns(self, tokenizer: PreTrainedTokenizerFast, **kwargs):
         """
         Tokenize the text columns from MIMIC-IV ED and MIMIC-CXR (excluding the findings and impression sections).
         Time deltas for the input_ids are also prepared here.
@@ -662,7 +685,7 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
             cxr - dictionary containing the input_ids, token_type_ids, and attention_mask for MIMIC-CXR columns.
         """
-        batch_size = len(kwargs['index'])
         tokenized = {
             'input_ids': {i: [] for i in range(batch_size)},
@@ -671,34 +694,37 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
             'attention_mask': torch.empty(batch_size, 0, 1, device=self.device),
         }
-        for i in self.decoder.config.ed_module_columns + self.decoder.config.mimic_cxr_columns + ['previous_findings', 'previous_impression']:
             if i in kwargs:
                 if f'{i}_time_delta' not in kwargs:
-                    kwargs[f'{i}_time_delta'] = [[self.decoder.config.zero_time_delta_value for _ in j] if j is not None else None for j in kwargs[i]]
                 for x, (y, z) in enumerate(zip(kwargs[i], kwargs[f'{i}_time_delta'])):
                     if y is not None:
                         assert isinstance(y, list)
                         assert isinstance(z, list)
                         for text, time_delta in zip(y, z):
-                            tokenized['input_ids'][x].append(
-                                tokenizer(text, add_special_tokens=False, return_tensors='pt')['input_ids'].to(device=self.device)
-                            )
-                            tokenized['token_type_ids'][x].append(
-                                torch.full(
-                                    (1, tokenized['input_ids'][x][-1].shape[-1]),
-                                    self.decoder.config.token_type_to_token_type_id[i],
-                                    dtype=torch.long,
-                                    device=self.device,
                                 )
-                            )
-                            tokenized['time_delta'][x].append(
-                                torch.full(
-                                    (1, tokenized['input_ids'][x][-1].shape[-1]),
-                                    time_delta,
-                                    dtype=torch.float32,
-                                    device=self.device,
                                 )
-                            )
         tokenized['input_ids'] = [torch.cat(j, dim=1).T if j else torch.empty(0, 1, dtype=torch.long, device=self.device) for j in tokenized['input_ids'].values()]
         tokenized['token_type_ids'] = [torch.cat(j, dim=1).T if j else torch.empty(0, 1, dtype=torch.long, device=self.device) for j in tokenized['token_type_ids'].values()]
@@ -725,7 +751,6 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
         tokenizer: PreTrainedTokenizerFast,
         tokenized_report=None,
         sep_token_id=None,
-        section_ids=None,
         **batch,
     ):
         """
@@ -736,8 +761,7 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
             tokenizer - Hugging Face tokenizer.
             tokenized_report - if training/teacher forcing, input the tokenized_report dict to include it in the prepared inputs.
             sep_token_id - separator token identifier.
-            section_ids - section identifiers for the findings and impression sections.
         Returns:
             inputs_embeds - input embeddings.
             attention_mask - attention mask.
@@ -755,23 +779,24 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
         bos_token_ids = None
         # Index and value columns:
-        batch_size = len(batch['index'])
-        for k in self.decoder.config.index_value_encoder_config.keys():
-            if f'{k}_index_value_feats' not in batch:
-                batch[f'{k}_index_value_feats'] = torch.empty(batch_size, 0, self.decoder.config.index_value_encoder_config[k], device=self.device)
-            inputs_embeds.append(
-                getattr(self, f'{k}_index_value_encoder')(batch[f'{k}_index_value_feats'])
-            )
-            token_type_ids.append(batch[f'{k}_index_value_token_type_ids'] if f'{k}_index_value_token_type_ids' in batch else torch.empty(batch_size, 0, dtype=torch.long, device=self.device))
-            attention_mask.append(batch[f'{k}_index_value_mask'] if f'{k}_index_value_mask' in batch else torch.empty(batch_size, 0, dtype=torch.long, device=self.device))
-            if f'{k}_time_delta' in batch:
-                time_delta.append(batch[f'{k}_time_delta'])
-            else:
-                time_delta_index_value = torch.zeros(*batch[f'{k}_index_value_mask'].shape, 1, device=self.device) if f'{k}_index_value_mask' in batch else torch.empty(batch_size, 0, 1, device=self.device)
-                time_delta.append(time_delta_index_value)
         # Tokenize text columns for prompt:
-        tokenized = self.tokenize_text_columns(tokenizer, **batch)
         input_ids.append(tokenized['input_ids'])
         token_type_ids.append(tokenized['token_type_ids'])
         attention_mask.append(tokenized['attention_mask'])
@@ -780,14 +805,17 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
         # Image encoder:
         encoder_outputs = self.encoder(images)
         inputs_embeds.append(encoder_outputs[0])
         inputs_per_image = encoder_outputs[0].shape[-2] // images.shape[1]
-        padded_image_time_deltas = [i + [self.decoder.config.zero_time_delta_value] * (images.shape[1] - len(i)) for i in batch['image_time_deltas']]
-        time_delta_image_features = torch.tensor(padded_image_time_deltas, device=self.device).repeat_interleave(inputs_per_image, dim=1)
         token_type_ids.append(
             torch.where(
-                time_delta_image_features == self.decoder.config.zero_time_delta_value,
-                self.decoder.config.token_type_to_token_type_id['image'],
-                self.decoder.config.token_type_to_token_type_id['previous_image'],
             ),
         )
         attention_mask.append(encoder_outputs[1])
@@ -819,7 +847,7 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
             report_token_type_ids = self.token_ids_to_token_type_ids(
                 token_ids=tokenized_report['decoder_input_ids'],
                 special_token_ids=[sep_token_id],
-                token_type_id_sections=section_ids,
             )
             token_type_ids.append(report_token_type_ids)
@@ -906,8 +934,11 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
         return mixed_causality_4d_attention_mask
     def position_ids_from_time_deltas_and_attention_mask(self, time_deltas, attention_mask):
-        _, col_indices = torch.sort(torch.where(attention_mask == 1, time_deltas[:, :, 0], torch.finfo(time_deltas.dtype).min), descending=not self.decoder.config.time_delta_monotonic_inversion)
         num_rows, num_cols, _ = time_deltas.shape
         row_indices = torch.arange(num_rows, device=time_deltas.device).view(-1, 1).repeat(1, num_cols).view(-1)
@@ -917,272 +948,316 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
         return position_ids
-    @staticmethod
-    def prepare_data(physionet_dir, database_dir):
-        Path(database_dir).mkdir(parents=True, exist_ok=True)
-        mimic_iv_duckdb_path = os.path.join(database_dir, 'mimic_iv_duckdb.db')
-        mimic_cxr_jpg_lmdb_path = os.path.join(database_dir, 'mimic_cxr_jpg_lmdb.db')
-        sectioned_dir = os.path.join(database_dir, 'mimic_cxr_sectioned')
-        mimic_cxr_sectioned_path = os.path.join(sectioned_dir, 'mimic_cxr_sectioned.csv')
-        if not os.path.exists(mimic_cxr_sectioned_path):
-            print(f'{mimic_cxr_sectioned_path} does not exist, creating...')
-            # Check that the reports exist. Only the reports for the first and last patients are checked; this compromises comprehensiveness for speed:
-            report_paths = [
-                os.path.join(physionet_dir, 'mimic-cxr/2.0.0/files/p10/p10000032/s50414267.txt'),
-                os.path.join(physionet_dir, 'mimic-cxr/2.0.0/files/p10/p10000032/s53189527.txt'),
-                os.path.join(physionet_dir, 'mimic-cxr/2.0.0/files/p10/p10000032/s53911762.txt'),
-                os.path.join(physionet_dir, 'mimic-cxr/2.0.0/files/p10/p10000032/s56699142.txt'),
-                os.path.join(physionet_dir, 'mimic-cxr/2.0.0/files/p19/p19999987/s55368167.txt'),
-                os.path.join(physionet_dir, 'mimic-cxr/2.0.0/files/p19/p19999987/s58621812.txt'),
-                os.path.join(physionet_dir, 'mimic-cxr/2.0.0/files/p19/p19999987/s58971208.txt'),
-            ]
-            assert all([os.path.isfile(i) for i in report_paths]), f"""The reports do not exist with the following regex: {os.path.join(physionet_dir, 'mimic-cxr/2.0.0/files/p1*/p1*/s*.txt')}.
-            "Please download them using wget -r -N -c -np --reject dcm --user <username> --ask-password https://physionet.org/files/mimic-cxr/2.0.0/"""
-            print('Extracting sections from reports...')
-            create_section_files(
-                reports_path=os.path.join(physionet_dir, 'mimic-cxr', '2.0.0', 'files'),
-                output_path=sectioned_dir,
-                no_split=True,
-            )
-        if not os.path.exists(mimic_iv_duckdb_path):
-            connect = duckdb.connect(mimic_iv_duckdb_path)
-            csv_paths = []
-            csv_paths.append(glob(os.path.join(physionet_dir, 'mimic-iv-ed', '*', 'ed', 'edstays.csv.gz'))[0])
-            csv_paths.append(glob(os.path.join(physionet_dir, 'mimic-iv-ed', '*', 'ed', 'medrecon.csv.gz'))[0])
-            csv_paths.append(glob(os.path.join(physionet_dir, 'mimic-iv-ed', '*', 'ed', 'pyxis.csv.gz'))[0])
-            csv_paths.append(glob(os.path.join(physionet_dir, 'mimic-iv-ed', '*', 'ed', 'triage.csv.gz'))[0])
-            csv_paths.append(glob(os.path.join(physionet_dir, 'mimic-iv-ed', '*', 'ed', 'vitalsign.csv.gz'))[0])
-            base_names = [os.path.basename(i) for i in csv_paths]
-            for i in ['edstays.csv.gz', 'medrecon.csv.gz', 'pyxis.csv.gz', 'triage.csv.gz', 'vitalsign.csv.gz']:
-                assert i in base_names, f"""Table {i} is missing from MIMIC-IV-ED.
-                    Please download the tables from https://physionet.org/content/mimic-iv-ed. Do not decompress them."""
-            csv_paths.append(glob(os.path.join(physionet_dir, 'mimic-cxr-jpg', '*', 'mimic-cxr-2.0.0-metadata.csv.gz'))[0])
-            csv_paths.append(glob(os.path.join(physionet_dir, 'mimic-cxr-jpg', '*', 'mimic-cxr-2.0.0-chexpert.csv.gz'))[0])
-            csv_paths.append(glob(os.path.join(physionet_dir, 'mimic-cxr-jpg', '*', 'mimic-cxr-2.0.0-split.csv.gz'))[0])
-            base_names = [os.path.basename(i) for i in csv_paths[-3:]]
-            for i in ['mimic-cxr-2.0.0-metadata.csv.gz', 'mimic-cxr-2.0.0-chexpert.csv.gz', 'mimic-cxr-2.0.0-split.csv.gz']:
-                assert i in base_names, f"""CSV file {i} is missing from MIMIC-IV-ED.
-                    Please download the tables from https://physionet.org/content/mimic-cxr-jpg. Do not decompress them."""
-            for i in csv_paths:
-                name = Path(i).stem.replace('.csv', '').replace('.gz', '').replace('-', '_').replace('.', '_')
-                print(f'Copying {name} into database...')
-                connect.sql(f"CREATE OR REPLACE TABLE {name} AS FROM '{i}';")
-            # MIMIC-CXR report sections:
-            print(f'Copying mimic_cxr_sectioned into database...')
-            connect.sql(f"CREATE OR REPLACE TABLE mimic_cxr_sectioned AS FROM '{mimic_cxr_sectioned_path}';")
-            columns = list(connect.sql('FROM mimic_cxr_sectioned LIMIT 1').df().columns)
-            if 'column0' in columns:  # If the column headers are not read correctly:
-                connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column0 TO study;")
-                connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column1 TO impression;")
-                connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column2 TO findings;")
-                connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column3 TO indication;")
-                connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column4 TO history;")
-                connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column5 TO last_paragraph;")
-                connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column6 TO comparison;")
-                connect.sql("DELETE FROM mimic_cxr_sectioned WHERE study='study';")
-            splits = connect.sql("FROM mimic_cxr_2_0_0_split").df()
-            reports = connect.sql("FROM mimic_cxr_sectioned").df()
-            metadata = connect.sql("FROM mimic_cxr_2_0_0_metadata").df()
-            chexpert = connect.sql("FROM mimic_cxr_2_0_0_chexpert").df()
-            # Create datetime column:
-            metadata['StudyTime'] = metadata['StudyTime'].astype(int)
-            metadata['study_datetime'] = pd.to_datetime(
-                metadata.apply(lambda x: f'{x["StudyDate"]} {x["StudyTime"]:06}', axis=1),
-                format='%Y%m%d %H%M%S',
-            )
-            reports.rename(columns={'study': 'study_id'}, inplace=True)
-            reports.study_id = reports.study_id.str[1:].astype('int32')
-            df = pd.merge(splits, reports, on='study_id')
-            df = pd.merge(df, metadata, on=['dicom_id', 'study_id', 'subject_id'])
-            df = pd.merge(df, chexpert, on=['study_id', 'subject_id'])
-            connect.sql(f"CREATE OR REPLACE TABLE mimic_cxr AS SELECT * FROM df")
-            # Create lookup tables:
-            for k, v in (ed_module_tables | mimic_cxr_tables).items():
-                if v.load and v.index_columns:
-                    start_idx = 0
-                    for i in v.index_columns_source:
-                        lut_name = f'{k}_{i}_lut'
-                        table = k
-                        lut, end_idx = create_lookup_table(connect.sql(f"SELECT {i} FROM {table}").df(), [i], start_idx)
-                        start_idx = end_idx + 1
-                        lut = lut.rename(columns={'index': f'{i}_index'})
-                        print(f'Creating {lut_name}...')
-                        connect.sql(f"CREATE OR REPLACE TABLE {lut_name} AS SELECT * FROM lut")
-                        if f'{i}_index' in connect.sql(f"FROM {k} LIMIT 0").df().columns:
-                            connect.sql(
-                                f"""
-                                ALTER TABLE {k}
-                                DROP COLUMN {i}_index;
-                                """
-                            )
-                        connect.sql(
-                            f"""
-                                CREATE OR REPLACE TABLE {k} AS
-                                SELECT {k}.*, {lut_name}.{i}_index
-                                FROM {k} LEFT JOIN {lut_name}
-                                ON {k}.{i} = {lut_name}.{i}
-                            """
-                        )
-                    connect.sql(
-                        f"""
-                            CREATE TABLE IF NOT EXISTS lut_info (table_name VARCHAR PRIMARY KEY, start_index INT, end_index INT);
-                            INSERT OR REPLACE INTO lut_info VALUES ('{k}', {0}, {end_idx});
-                        """
-                    )
-            table_studies = {
-                'edstays': [],
-                'triage': [],
-                'medrecon': [],
-                'vitalsign': [],
-                'pyxis': [],
-            }
-            stay_id_tables = ['triage']
-            stay_id_charttime_tables = ['medrecon', 'vitalsign', 'pyxis']
-            df = connect.sql(f"FROM mimic_cxr").df()
-            # DICOM identifiers can have different datetimes, so use most recent datetime for the study:
-            df = df.sort_values(by='study_datetime', ascending=False)
-            df = df.groupby('study_id').first().reset_index()
-            print('Searching for studies associated with an ED stay...')
-            for _, row in tqdm(df.iterrows(), total=df.shape[0]):
-                edstays = connect.sql(
-                    f"""
-                    SELECT stay_id, intime, outtime
-                    FROM edstays
-                    WHERE (subject_id = {row['subject_id']})
-                    AND intime < '{row['study_datetime']}'
-                    AND outtime > '{row['study_datetime']}';
-                    """
-                ).df()
-                if len(edstays) > 0:
-                    for i in edstays['stay_id'].to_list():
-                        table_studies['edstays'].append({'study_id': row['study_id'], 'stay_id': i})
-                        for j in stay_id_tables:
-                            table = connect.sql(
-                                f"""
-                                SELECT stay_id
-                                FROM {j}
-                                WHERE (stay_id = {i});
-                                """
-                            ).df()
-                            for k in table['stay_id'].to_list():
-                                table_studies[j].append({'study_id': row['study_id'], 'stay_id': k})
-                        for j in stay_id_charttime_tables:
-                            table = connect.sql(
-                                f"""
-                                SELECT stay_id
-                                FROM {j}
-                                WHERE (stay_id = {i})
-                                AND charttime < '{row['study_datetime']}';
-                                """
-                            ).df()
-                            for k in table['stay_id'].to_list():
-                                table_studies[j].append({'study_id': row['study_id'], 'stay_id': k})
-            for k, v in table_studies.items():
-                df = pd.DataFrame(v)
-                df = df.drop_duplicates(subset=['study_id', 'stay_id'])
-                connect.sql(f"CREATE TABLE {k}_study_ids AS SELECT * FROM df")
-            connect.close()
-        if not os.path.exists(mimic_cxr_jpg_lmdb_path):
-            print('Preparing MIMIC-CXR-JPG LMDB database...')
-            pattern = os.path.join(physionet_dir, 'mimic-cxr-jpg', '*', 'files')
-            mimic_cxr_jpg_dir = glob(pattern)
-            assert len(mimic_cxr_jpg_dir), f'Multiple directories matched the pattern {pattern}: {mimic_cxr_jpg_dir}. Only one is required.'
-            prepare_mimic_cxr_jpg_lmdb(
-                mimic_iv_duckdb_path=mimic_iv_duckdb_path,
-                mimic_cxr_jpg_dir=mimic_cxr_jpg_dir[0],
-                mimic_cxr_jpg_lmdb_path=mimic_cxr_jpg_lmdb_path,
-                map_size_tb=0.65
-            )
-    @staticmethod
-    def get_dataset(split, transforms, database_dir, max_images_per_study=5, mimic_cxr_jpg_dir=None, records=None):
-        mimic_iv_duckdb_path = os.path.join(database_dir, 'mimic_iv_duckdb.db')
-        mimic_cxr_jpg_lmdb_path = os.path.join(database_dir, 'mimic_cxr_jpg_lmdb.db') if mimic_cxr_jpg_dir is None else None
-        if records is None:
-            # This is the setup for CXRs + all effective inputs - medicine reconciliation:
-            records = EDCXRSubjectRecords(database_path=mimic_iv_duckdb_path, time_delta_map=lambda x: 1 / math.sqrt(x + 1))
-            records.ed_module_tables = {k: records.ed_module_tables[k] for k in ['edstays', 'triage', 'vitalsign']}
-            records.mimic_cxr_tables = {k: records.mimic_cxr_tables[k] for k in ['mimic_cxr_sectioned']}
-            records.mimic_cxr_tables['mimic_cxr_sectioned'].text_columns = ['indication', 'history']
-        dataset = StudyIDEDStayIDSubset(
-                mimic_cxr_jpg_lmdb_path=mimic_cxr_jpg_lmdb_path,
-                mimic_cxr_dir=mimic_cxr_jpg_dir,
-                transforms=transforms,
-                split=split,
-                max_images_per_study=max_images_per_study,
-                records=records,
             )
-        print(f'No. of examples: {dataset.__len__()}.')
-        print(
-            f'No. of training dicom_ids, study_ids, & subject_ids: {dataset.num_dicom_ids},',
-            f'{dataset.num_study_ids}, & {dataset.num_subject_ids}.',
-        )
-        return dataset
     @staticmethod
     def collate_fn(batch):
         keys = set().union(*(d.keys() for d in batch))
         batch = {j: [i.setdefault(j, None) for i in batch] for j in keys}
-        batch['images'] = torch.nn.utils.rnn.pad_sequence(batch['images'], batch_first=True, padding_value=0.0)
-        for k in keys:
-            if 'index_value_feats' in k:
-                total_indices = next(i for i in batch[k] if i is not None).shape[-1]
-                batch[k] = [i if i is not None else torch.empty(0, total_indices) for i in batch[k]]
-                batch[k] = torch.nn.utils.rnn.pad_sequence(batch[k], batch_first=True, padding_value=-1)  # Pad value of -1 is not ideal. Need to use something else.
-                token_type_id_name = k.replace('_feats', '_token_type_ids')
-                batch[token_type_id_name] = [i if i is not None else torch.empty(0, dtype=torch.long) for i in batch[token_type_id_name]]
-                batch[token_type_id_name] = torch.nn.utils.rnn.pad_sequence(
-                    batch[token_type_id_name], batch_first=True, padding_value=0,
-                )
-                mask_name = k.replace('_feats', '_mask')
-                batch[mask_name] = (batch[k] != -1).any(dim=-1).int()
-            if 'time_delta' in k and 'index_value' in k:
-                batch[k] = [i if i is not None else torch.empty(0, 1) for i in batch[k]]
-                batch[k] = torch.nn.utils.rnn.pad_sequence(batch[k], batch_first=True, padding_value=0)
-        return batch

+import json
 import math
 import os
+import random
 from typing import Optional, Tuple, Union
+import datasets
 import torch
 import transformers
 from torch.nn import CrossEntropyLoss
+from torch.utils.data import Subset
+from torchvision.io import decode_image
 from transformers import PreTrainedTokenizerFast, VisionEncoderDecoderModel
 from transformers.configuration_utils import PretrainedConfig
 from transformers.modeling_outputs import Seq2SeqLMOutput
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import logging
+from .configuration_cxrmate_ed import EncoderDecoderConfig
+from .dataset import PriorsDataset
 from .modelling_uniformer import MultiUniFormerWithProjectionHead
+from .prepare_dataset import prepare_dataset
+from .utils import compute_time_delta
 logger = logging.get_logger(__name__)
+# Ordered by oblique, lateral, AP, and then PA views so that PA views are closest in position to the generated tokens (and oblique views are furthest).
+VIEW_ORDER = [None, 'LPO', 'RAO', 'LAO', 'SWIMMERS', 'XTABLE LATERAL', 'LL', 'LATERAL',  'AP AXIAL', 'AP RLD', 'AP LLD', 'AP', 'PA RLD', 'PA LLD', 'PA']
 def create_lookup_table(df, columns, start_idx):
     df = df.groupby(columns).head(1)[columns].sort_values(by=columns)
 class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
+    config_class = EncoderDecoderConfig
     base_model_prefix = "vision_encoder_decoder"
     main_input_name = "input_ids"
     supports_gradient_checkpointing = True
+    def __init__(
         self,
         config: Optional[PretrainedConfig] = None,
         encoder: Optional[PreTrainedModel] = None,
         if config is None and (encoder is None or decoder is None):
             raise ValueError("Either a configuration or an encoder and a decoder has to be provided.")
         if config is None:
+            config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config)
         else:
             if not isinstance(config, self.config_class):
                 raise ValueError(f"Config: {config} has to be of type {self.config_class}")
         assert not config.decoder.is_encoder_decoder
         assert 'pad_token_id' in self.decoder.config.__dict__
         assert 'time_delta_monotonic_inversion' in self.decoder.config.__dict__
         assert 'add_time_deltas' in self.decoder.config.__dict__
+        assert 'history' in self.decoder.config.__dict__
+        assert 'tables_filter' in self.decoder.config.__dict__
+        assert 'prompt_report_sections_filter' in self.decoder.config.__dict__
         assert isinstance(self.decoder.config.time_delta_monotonic_inversion, bool)
+        with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tables.json'), 'r') as f:
+            self.tables = json.load(f)
+        with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'lookup_tables.json'), 'r') as f:
+            self.luts = json.load(f)
+        with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'token_type_ids.json'), 'r') as f:
+            self.token_type_to_token_type_id = json.load(f)
+        self.tables = {k: self.tables[k] for k in self.decoder.config.tables_filter}
+        self.tables['mimic_cxr_sectioned']['text_columns'] = self.decoder.config.prompt_report_sections_filter
+        for k in self.tables.keys():
+            if self.luts[k]['total'] > 0:
+                setattr(
+                    self,
+                    f'{k}_index_value_encoder',
+                    FNNEncoder(
+                        num_features=self.luts[k]['total'],
+                        intermediate_size=self.decoder.config.index_value_encoder_intermediate_size,
+                        decoder_hidden_size=self.decoder.config.hidden_size,
+                    ),
+                )
         if self.decoder.config.add_time_deltas:
             self.time_delta_encoder = FNNEncoder(
                 num_features=1,
                 intermediate_size=self.decoder.config.index_value_encoder_intermediate_size,
                 decoder_hidden_size=self.decoder.config.hidden_size,
             )
+        self.token_type_embeddings = torch.nn.Embedding(max(self.token_type_to_token_type_id.values()) + 1, self.decoder.config.hidden_size)
+        self.time_delta_map = lambda x: 1 / math.sqrt(x + 1)
+        self.zero_time_delta_value = self.time_delta_map(0)
+        self.inf_time_delta_value = self.time_delta_map(float('inf'))
     @classmethod
     def from_encoder_decoder_pretrained(
             decoder = transformers.AutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
         # instantiate config with corresponding kwargs
+        config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs)
         # make sure the input & output embeddings are not tied
         config.tie_word_embeddings = False
     def forward(
         self,
+        decoder_position_ids: torch.LongTensor,
+        decoder_attention_mask: torch.FloatTensor,
+        decoder_token_type_ids: torch.LongTensor,
         decoder_input_ids: Optional[torch.LongTensor] = None,
         encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
         labels: Optional[torch.LongTensor] = None,
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
             argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
         }
         assert decoder_attention_mask.dtype == torch.long, f'The dtype for {decoder_attention_mask} was {decoder_attention_mask.dtype}. It should be torch.long'
         if decoder_inputs_embeds is None:
             decoder_inputs_embeds = self.decoder.get_input_embeddings()(decoder_input_ids)
         special_token_ids,
         prompt_attention_mask,
         prompt_position_ids,
         past_key_values=None,
         use_cache=None,
         **kwargs,
             # `inputs_embeds` are only to be used in the 1st generation step:
             inputs_embeds = torch.cat([kwargs['decoder_inputs_embeds'], self.decoder.get_input_embeddings()(input_ids)], dim=1)
+            decoder_token_type_ids = self.token_ids_to_token_type_ids(
+                input_ids, special_token_ids,
+                [self.token_type_to_token_type_id['findings'], self.token_type_to_token_type_id['impression']],
+            )
             decoder_token_type_ids = torch.cat(
                 [
                     kwargs['decoder_token_type_ids'],
             decoder_position_ids.masked_fill_(report_attention_mask == 0, 1)
             # Always place token_ids_to_token_type_ids_past_key_values before input_ids = input_ids[:, remove_prefix_length:]:
+            decoder_token_type_ids = self.token_ids_to_token_type_ids_past_key_values(
+                input_ids,
+                special_token_ids,
+                [self.token_type_to_token_type_id['findings'], self.token_type_to_token_type_id['impression']],
+            )
             decoder_position_ids = decoder_position_ids[:, -1:]
             past_length = past_key_values[0][0].shape[2]
         )
         return input_dict
+    def token_ids_to_token_type_ids(self, token_ids, special_token_ids, token_type_id_sections):
         """
         Extract token type identifiers from the token identifiers.
         return token_type_ids
+    def token_ids_to_token_type_ids_past_key_values(self, token_ids, special_token_ids, token_type_id_sections):
         """
         Extract token type identifiers from the token identifiers if past != None. Make sure to input all the
         token_ids (e.g., do not input input_ids = input_ids[:, remove_prefix_length:] from prepare_inputs_for_generation).
         return tuple(sections.values())
+    def tokenize_text_prompt(self, tokenizer: PreTrainedTokenizerFast, **kwargs):
         """
         Tokenize the text columns from MIMIC-IV ED and MIMIC-CXR (excluding the findings and impression sections).
         Time deltas for the input_ids are also prepared here.
             cxr - dictionary containing the input_ids, token_type_ids, and attention_mask for MIMIC-CXR columns.
         """
+        batch_size = len(kwargs['study_id'])
         tokenized = {
             'input_ids': {i: [] for i in range(batch_size)},
             'attention_mask': torch.empty(batch_size, 0, 1, device=self.device),
         }
+        prompt_text_columns = [f'{k}_{j}' if k != 'mimic_cxr_sectioned' else j for k, v in self.tables.items() if 'text_columns' in v for j in (v['text_columns'] if isinstance(v['text_columns'], list) else [v['text_columns']])] + ['prior_findings', 'prior_impression']
+        for i in prompt_text_columns:
             if i in kwargs:
                 if f'{i}_time_delta' not in kwargs:
+                    kwargs[f'{i}_time_delta'] = [[self.zero_time_delta_value for _ in j] if j is not None else None for j in kwargs[i]]
                 for x, (y, z) in enumerate(zip(kwargs[i], kwargs[f'{i}_time_delta'])):
                     if y is not None:
                         assert isinstance(y, list)
                         assert isinstance(z, list)
                         for text, time_delta in zip(y, z):
+                            if text is not None:
+                                tokenized['input_ids'][x].append(
+                                    tokenizer(text, add_special_tokens=False, return_tensors='pt')['input_ids'].to(device=self.device)
                                 )
+                                tokenized['token_type_ids'][x].append(
+                                    torch.full(
+                                        (1, tokenized['input_ids'][x][-1].shape[-1]),
+                                        self.token_type_to_token_type_id[i],
+                                        dtype=torch.long,
+                                        device=self.device,
+                                    )
+                                )
+                                tokenized['time_delta'][x].append(
+                                    torch.full(
+                                        (1, tokenized['input_ids'][x][-1].shape[-1]),
+                                        time_delta,
+                                        dtype=torch.float32,
+                                        device=self.device,
+                                    )
                                 )
         tokenized['input_ids'] = [torch.cat(j, dim=1).T if j else torch.empty(0, 1, dtype=torch.long, device=self.device) for j in tokenized['input_ids'].values()]
         tokenized['token_type_ids'] = [torch.cat(j, dim=1).T if j else torch.empty(0, 1, dtype=torch.long, device=self.device) for j in tokenized['token_type_ids'].values()]
         tokenizer: PreTrainedTokenizerFast,
         tokenized_report=None,
         sep_token_id=None,
         **batch,
     ):
         """
             tokenizer - Hugging Face tokenizer.
             tokenized_report - if training/teacher forcing, input the tokenized_report dict to include it in the prepared inputs.
             separator_token_id - separator token identifier.
         Returns:
             inputs_embeds - input embeddings.
             attention_mask - attention mask.
         bos_token_ids = None
         # Index and value columns:
+        batch_size = images.shape[0]
+        for k, v in self.tables.items():
+            if 'index_columns' in v or 'value_columns' in v:
+                if f'{k}_index_value_feats' not in batch:
+                    batch[f'{k}_index_value_feats'] = torch.empty(batch_size, 0, self.luts[k]['total'], device=self.device)
+                inputs_embeds.append(
+                    getattr(self, f'{k}_index_value_encoder')(batch[f'{k}_index_value_feats'])
+                )
+                token_type_ids.append(batch[f'{k}_index_value_token_type_ids'] if f'{k}_index_value_token_type_ids' in batch else torch.empty(batch_size, 0, dtype=torch.long, device=self.device))
+                attention_mask.append(batch[f'{k}_index_value_mask'] if f'{k}_index_value_mask' in batch else torch.empty(batch_size, 0, dtype=torch.long, device=self.device))
+                if f'{k}_index_value_time_delta' in batch:
+                    time_delta.append(batch[f'{k}_index_value_time_delta'])
+                else:
+                    time_delta_index_value = torch.zeros(*batch[f'{k}_index_value_mask'].shape, 1, device=self.device) if f'{k}_index_value_mask' in batch else torch.empty(batch_size, 0, 1, device=self.device)
+                    time_delta.append(time_delta_index_value)
         # Tokenize text columns for prompt:
+        tokenized = self.tokenize_text_prompt(tokenizer, **batch)
         input_ids.append(tokenized['input_ids'])
         token_type_ids.append(tokenized['token_type_ids'])
         attention_mask.append(tokenized['attention_mask'])
         # Image encoder:
         encoder_outputs = self.encoder(images)
         inputs_embeds.append(encoder_outputs[0])
         inputs_per_image = encoder_outputs[0].shape[-2] // images.shape[1]
+        time_delta_image_features = torch.tensor(batch['image_time_deltas'], device=self.device).repeat_interleave(inputs_per_image, dim=1)
         token_type_ids.append(
             torch.where(
+                torch.logical_or(
+                    time_delta_image_features == self.zero_time_delta_value,
+                    time_delta_image_features == self.inf_time_delta_value,
+                ),
+                self.token_type_to_token_type_id['image'],
+                self.token_type_to_token_type_id['prior_image'],
             ),
         )
         attention_mask.append(encoder_outputs[1])
             report_token_type_ids = self.token_ids_to_token_type_ids(
                 token_ids=tokenized_report['decoder_input_ids'],
                 special_token_ids=[sep_token_id],
+                token_type_id_sections=[self.token_type_to_token_type_id['findings'], self.token_type_to_token_type_id['impression']],
             )
             token_type_ids.append(report_token_type_ids)
         return mixed_causality_4d_attention_mask
     def position_ids_from_time_deltas_and_attention_mask(self, time_deltas, attention_mask):
+        mask_value = torch.finfo(time_deltas.dtype).max if self.decoder.config.time_delta_monotonic_inversion else torch.finfo(time_deltas.dtype).min
+        masked_time_deltas = torch.where(attention_mask == 1, time_deltas[:, :, 0], mask_value)
+        _, col_indices = torch.sort(masked_time_deltas, descending=not self.decoder.config.time_delta_monotonic_inversion)
         num_rows, num_cols, _ = time_deltas.shape
         row_indices = torch.arange(num_rows, device=time_deltas.device).view(-1, 1).repeat(1, num_cols).view(-1)
         return position_ids
+    def get_dataset(self, dataset_path, train_transforms, test_transforms, max_train_images_per_study, study_id_split='mimic_iv_ed_mimic_cxr_jpg', test_set_only=False):
+        def train_set_transform(batch):
+            # Randomly select max_train_images_per_study if the number of images for a study exceeds max_train_images_per_study.
+            keys = ['images', 'dicom_id']
+            keys = keys + self.tables['mimic_cxr_2_0_0_metadata']['index_columns'] if 'mimic_cxr_2_0_0_metadata' in self.tables else keys
+            for i in range(len(batch['images'])):
+                if len(batch['images'][i]) > max_train_images_per_study:
+                    paired = list(zip(*(batch[key][i] for key in keys)))
+                    sampled_pairs = random.sample(paired, max_train_images_per_study)
+                    unzipped_samples = zip(*sampled_pairs)
+                    for key, values in zip(keys, unzipped_samples):
+                        batch[key][i] = list(values)
+            batch['images'] = [[decode_image(torch.frombuffer(bytearray(j), dtype=torch.uint8)) for j in i] for i in batch['images']]
+            # Sort based on ViewPosition:
+            batch['images'] = [list(zip(*sorted(zip(i, v), key=lambda x: VIEW_ORDER.index(x[1]))))[0] for i, v in zip(batch['images'], batch['ViewPosition'])]
+            batch['images'] = [torch.stack([train_transforms(j) for j in i]) for i in batch['images']]
+            max_size = max(i.shape[0] for i in batch['images'])
+            batch['image_time_deltas'] = [[self.zero_time_delta_value if j < i.shape[0] else self.inf_time_delta_value for j in range(max_size)] for i in batch['images']]
+            batch['images'] = torch.nn.utils.rnn.pad_sequence(batch['images'], batch_first=True, padding_value=0.0)
+            for k, v in self.tables.items():
+                if 'index_columns' in v or 'value_columns' in v:
+                    batch[f'{k}_index_value_feats'],  batch[f'{k}_index_value_token_type_ids'], batch[f'{k}_index_value_time_delta'], batch[f'{k}_index_value_mask'] = self.prepare_index_value_feats(k, batch)
+            for k, v in self.tables.items():
+                if 'text_columns' in v:
+                    for i in v['text_columns']:
+                        key = f'{k}_{i}' if not k == 'mimic_cxr_sectioned' else i
+                        batch[key], batch[f'{key}_time_delta'] = self.prepare_text_prompt(k, i, batch)
+            return batch
+        def test_set_transform(batch):
+            batch['images'] = [[decode_image(torch.frombuffer(bytearray(j), dtype=torch.uint8)) for j in i] for i in batch['images']]
+            # Sort based on ViewPosition:
+            batch['images'] = [list(zip(*sorted(zip(i, v), key=lambda x: VIEW_ORDER.index(x[1]))))[0] for i, v in zip(batch['images'], batch['ViewPosition'])]
+            batch['images'] = [torch.stack([test_transforms(j) for j in i]) for i in batch['images']]
+            max_size = max(i.shape[0] for i in batch['images'])
+            batch['image_time_deltas'] = [[self.zero_time_delta_value if j < i.shape[0] else self.inf_time_delta_value for j in range(max_size)] for i in batch['images']]
+            batch['images'] = torch.nn.utils.rnn.pad_sequence(batch['images'], batch_first=True, padding_value=0.0)
+            for k, v in self.tables.items():
+                if 'index_columns' in v or 'value_columns' in v:
+                    batch[f'{k}_index_value_feats'],  batch[f'{k}_index_value_token_type_ids'], batch[f'{k}_index_value_time_delta'], batch[f'{k}_index_value_mask'] = self.prepare_index_value_feats(k, batch)
+            for k, v in self.tables.items():
+                if 'text_columns' in v:
+                    for i in v['text_columns']:
+                        key = f'{k}_{i}' if not k == 'mimic_cxr_sectioned' else i
+                        batch[key], batch[f'{key}_time_delta'] = self.prepare_text_prompt(k, i, batch)
+            return batch
+        dataset = datasets.load_from_disk(dataset_path)
+        # Train set:
+        if not test_set_only:
+            with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), f'{study_id_split}_train_study_ids.json'), 'r') as f:
+                study_ids = json.load(f)
+            train_set = dataset['train']
+            train_set_study_ids = train_set['study_id']
+            index_map = {study_id: idx for idx, study_id in enumerate(train_set_study_ids)}
+            indices = [index_map[study_id] for study_id in study_ids if study_id in index_map]
+            indices.sort()
+            train_set = PriorsDataset(train_set, self.decoder.config.history, self.time_delta_map)
+            train_set.set_transform(train_set_transform)
+            train_set = Subset(train_set, indices)
+        else:
+            train_set = None
+        # Validation set:
+        if not test_set_only:
+            with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), f'{study_id_split}_validate_study_ids.json'), 'r') as f:
+                study_ids = json.load(f)
+            val_set = dataset['validate']
+            val_set_study_ids = val_set['study_id']
+            index_map = {study_id: idx for idx, study_id in enumerate(val_set_study_ids)}
+            indices = [index_map[study_id] for study_id in study_ids if study_id in index_map]
+            indices.sort()
+            val_set = PriorsDataset(val_set, self.decoder.config.history, self.time_delta_map)
+            val_set.set_transform(test_set_transform)
+            val_set = Subset(val_set, indices)
+        else:
+            val_set = None
+        # Test set:
+        with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), f'{study_id_split}_test_study_ids.json'), 'r') as f:
+            study_ids = json.load(f)
+        test_set = dataset['test']
+        test_set_study_ids = test_set['study_id']
+        index_map = {study_id: idx for idx, study_id in enumerate(test_set_study_ids)}
+        indices = [index_map[study_id] for study_id in study_ids if study_id in index_map]
+        indices.sort()
+        test_set = PriorsDataset(test_set, self.decoder.config.history, self.time_delta_map)
+        test_set.set_transform(test_set_transform)
+        test_set = Subset(test_set, indices)
+        return train_set, val_set, test_set
+    def get_stage_1_dataset(self, dataset_path, train_transforms, test_transforms, max_train_images_per_study):
+        def train_set_transform(batch):
+            # Randomly select max_train_images_per_study if the number of images for a study exceeds max_train_images_per_study.
+            for i in range(len(batch['images'])):
+                if len(batch['images'][i]) > max_train_images_per_study:
+                    paired = list(zip(batch['images'][i], batch['ViewPosition'][i]))
+                    sampled_pairs = random.sample(paired, max_train_images_per_study)
+                    batch['images'][i], batch['ViewPosition'][i] = zip(*sampled_pairs)
+            batch['images'] = [[decode_image(torch.frombuffer(bytearray(j), dtype=torch.uint8)) for j in i] for i in batch['images']]
+            # Sort based on ViewPosition:
+            batch['images'] = [list(zip(*sorted(zip(i, v), key=lambda x: VIEW_ORDER.index(x[1]))))[0] for i, v in zip(batch['images'], batch['ViewPosition'])]
+            batch['images'] = [torch.stack([train_transforms(j) for j in i]) for i in batch['images']]
+            max_size = max(i.shape[0] for i in batch['images'])
+            batch['image_time_deltas'] = [[self.zero_time_delta_value if j < i.shape[0] else self.inf_time_delta_value for j in range(max_size)] for i in batch['images']]
+            batch['images'] = torch.nn.utils.rnn.pad_sequence(batch['images'], batch_first=True, padding_value=0.0)
+            return batch
+        def test_set_transform(batch):
+            batch['images'] = [[decode_image(torch.frombuffer(bytearray(j), dtype=torch.uint8)) for j in i] for i in batch['images']]
+            # Sort based on ViewPosition:
+            batch['images'] = [list(zip(*sorted(zip(i, v), key=lambda x: VIEW_ORDER.index(x[1]))))[0] for i, v in zip(batch['images'], batch['ViewPosition'])]
+            batch['images'] = [torch.stack([test_transforms(j) for j in i]) for i in batch['images']]
+            max_size = max(i.shape[0] for i in batch['images'])
+            batch['image_time_deltas'] = [[self.zero_time_delta_value if j < i.shape[0] else self.inf_time_delta_value for j in range(max_size)] for i in batch['images']]
+            batch['images'] = torch.nn.utils.rnn.pad_sequence(batch['images'], batch_first=True, padding_value=0.0)
+            return batch
+        dataset = datasets.load_from_disk(dataset_path)
+        # Train set:
+        with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), f'mimic_cxr_jpg_train_study_ids.json'), 'r') as f:
+            study_ids = json.load(f)
+        train_set = dataset['train']
+        train_set_study_ids = train_set['study_id']
+        index_map = {study_id: idx for idx, study_id in enumerate(train_set_study_ids)}
+        indices = [index_map[study_id] for study_id in study_ids if study_id in index_map]
+        indices.sort()
+        train_set = PriorsDataset(train_set, self.decoder.config.history, self.time_delta_map)
+        train_set.set_transform(train_set_transform)
+        train_set = Subset(train_set, indices)
+        # Validation set:
+        with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), f'mimic_cxr_jpg_validate_study_ids.json'), 'r') as f:
+            study_ids = json.load(f)
+        val_set = dataset['validate']
+        val_set_study_ids = val_set['study_id']
+        index_map = {study_id: idx for idx, study_id in enumerate(val_set_study_ids)}
+        indices = [index_map[study_id] for study_id in study_ids if study_id in index_map]
+        indices.sort()
+        val_set = PriorsDataset(val_set, self.decoder.config.history, self.time_delta_map)
+        val_set.set_transform(test_set_transform)
+        val_set = Subset(val_set, indices)
+        # Test set:
+        with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), f'mimic_cxr_jpg_test_study_ids.json'), 'r') as f:
+            study_ids = json.load(f)
+        test_set = dataset['test']
+        test_set_study_ids = test_set['study_id']
+        index_map = {study_id: idx for idx, study_id in enumerate(test_set_study_ids)}
+        indices = [index_map[study_id] for study_id in study_ids if study_id in index_map]
+        indices.sort()
+        test_set = PriorsDataset(test_set, self.decoder.config.history, self.time_delta_map)
+        test_set.set_transform(test_set_transform)
+        test_set = Subset(test_set, indices)
+        return train_set, val_set, test_set
+    def prepare_index_value_feats(self, table, batch):
+        index_value_columns = (self.tables[table].get('index_columns', []) + self.tables[table].get('value_columns', []))
+        index_value_columns = [f'{table}_{i}' for i in index_value_columns] if table != 'mimic_cxr_2_0_0_metadata' else index_value_columns
+        # Map to indices with lookup table:
+        if 'index_columns' in self.tables[table]:
+            for i in self.tables[table]['index_columns']:
+                k = f'{table}_{i}' if not table == 'mimic_cxr_2_0_0_metadata' else i
+                batch[k] = [
+                    [self.luts[table][i][str(k)] if k is not None else None for k in j] if j is not None else None for j in batch[k]
+                ]
+        batch_index_value_feats_list = []
+        batch_token_type_ids_list = []
+        batch_time_deltas_list = []
+        for batch_idx in range(len(batch['study_id'])):
+            if any([batch[k][batch_idx] for k in index_value_columns]):
+                num_rows = [len(batch[i][batch_idx]) for i in index_value_columns]
+                assert all(x == num_rows[0] for x in num_rows)
+                num_rows = num_rows[0]
+                # The y-index and the datetime for each group:
+                if isinstance(batch[self.tables[table]['groupby']][batch_idx], list):
+                    y_indices = [d.setdefault(x, len(d)) for d in [{}] for x in batch[self.tables[table]['groupby']][batch_idx]]
+                    datetime = [j for i, j in enumerate(batch[self.tables[table]['time_column']][batch_idx]) if j not in batch[self.tables[table]['time_column']][batch_idx][:i]]
+                    assert len(set(y_indices)) == len(datetime)
+                else:
+                    y_indices = [0] * num_rows
+                    datetime = batch[self.tables[table]['time_column']][batch_idx] if 'time_column' in self.tables[table] else [batch['latest_study_datetime'][batch_idx]]
+                time_deltas = torch.tensor([compute_time_delta(i, batch['latest_study_datetime'][batch_idx], self.time_delta_map, to_tensor=False) for i in datetime])[:, None]
+                tensor = torch.zeros(max(y_indices) + 1, self.luts[table]['total'])
+                # Index columns to feats:
+                if 'index_columns' in self.tables[table]:
+                    for i in self.tables[table]['index_columns']:
+                        k = f'{table}_{i}' if not table == 'mimic_cxr_2_0_0_metadata' else i
+                        y_indices_column = [y_idx for y_idx, x_idx in zip(y_indices, batch[k][batch_idx]) if x_idx is not None]
+                        x_indices_column = [x_idx for x_idx in batch[k][batch_idx] if x_idx is not None]
+                        tensor[y_indices_column, x_indices_column] = 1.0
+                if 'value_columns' in self.tables[table]:
+                    for i in self.tables[table]['value_columns']:
+                        k = f'{table}_{i}' if not table == 'mimic_cxr_2_0_0_metadata' else i
+                        y_indices_column = [y_idx for y_idx, value in zip(y_indices, batch[k][batch_idx]) if value is not None]
+                        x_indices_column = [self.luts[table][i] for value in batch[k][batch_idx] if value is not None]
+                        values = [value for value in batch[k][batch_idx] if value is not None]
+                        tensor[y_indices_column, x_indices_column] = torch.tensor(values, dtype=tensor.dtype)
+                        assert not torch.isnan(tensor).any()
+            else:
+                tensor = torch.empty(0, self.luts[table]['total'])
+                time_deltas = torch.empty(0, 1)
+            batch_index_value_feats_list.append(tensor)
+            batch_token_type_ids_list.append(torch.full(
+                    [tensor.shape[0]],
+                    self.token_type_to_token_type_id[table],
+                    dtype=torch.long,
+                )
             )
+            batch_time_deltas_list.append(time_deltas)
+            assert tensor.shape[0] == batch_token_type_ids_list[-1].shape[0]
+            assert tensor.shape[0] == time_deltas.shape[0]
+        batch_index_value_feats = torch.nn.utils.rnn.pad_sequence(batch_index_value_feats_list, batch_first=True, padding_value=-1)  # Pad value of -1 is not ideal. Need to use something else.
+        batch_token_type_ids = torch.nn.utils.rnn.pad_sequence(batch_token_type_ids_list, batch_first=True, padding_value=0)
+        batch_time_deltas = torch.nn.utils.rnn.pad_sequence(batch_time_deltas_list, batch_first=True, padding_value=0)
+        batch_mask = (batch_index_value_feats != -1).any(dim=-1).int()
+        return batch_index_value_feats, batch_token_type_ids, batch_time_deltas, batch_mask
+    def prepare_text_prompt(self, table, column, batch):
+        key = f'{table}_{column}' if not table == 'mimic_cxr_sectioned' else column
+        batch_text_list = []
+        batch_time_deltas_list = []
+        for batch_idx in range(len(batch['study_id'])):
+            if batch[key][batch_idx]:
+                num_rows = len(batch[key][batch_idx])
+                # The y-index and the datetime for each group:
+                if isinstance(batch[self.tables[table]['groupby']][batch_idx], list):
+                    y_indices = [d.setdefault(x, len(d)) for d in [{}] for x in batch[self.tables[table]['groupby']][batch_idx]]
+                    datetime = [j for i, j in enumerate(batch[self.tables[table]['time_column']][batch_idx]) if j not in batch[self.tables[table]['time_column']][batch_idx][:i]]
+                    assert len(set(y_indices)) == len(datetime)
+                else:
+                    y_indices = [0] * num_rows
+                    datetime = batch[self.tables[table]['time_column']][batch_idx] if 'time_column' in self.tables[table] else [batch['latest_study_datetime'][batch_idx]]
+                # Remove None values:
+                text_rows = batch[key][batch_idx] if isinstance(batch[key][batch_idx], list) else [batch[key][batch_idx]]
+                y_indices = [i for i, j in zip(y_indices, text_rows) if j is not None]
+                text_rows = [i for i in text_rows if i is not None]
+                datetime = [datetime[i] for i in set(y_indices)]
+                if text_rows:
+                    # Those in the same group (or those with the same y-index) get joined as the same string:
+                    batch_text_list.append([', '.join([text_rows[j] for j in range(len(y_indices)) if y_indices[j] == k]) + '.' for k in set(y_indices)])
+                    batch_time_deltas_list.append([compute_time_delta(i, batch['latest_study_datetime'][batch_idx], self.time_delta_map, to_tensor=False) for i in datetime])
+                    assert len(batch_time_deltas_list[-1]) == len(batch_text_list[-1])
+                else:
+                    batch_text_list.append([])
+                    batch_time_deltas_list.append([])
+            else:
+                batch_text_list.append([])
+                batch_time_deltas_list.append([])
+        return batch_text_list, batch_time_deltas_list
     @staticmethod
     def collate_fn(batch):
         keys = set().union(*(d.keys() for d in batch))
         batch = {j: [i.setdefault(j, None) for i in batch] for j in keys}
+        batch = {k: torch.stack(v) if isinstance(v[0], torch.Tensor) else v for k, v in batch.items()}
+        return batch
+    @staticmethod
+    def prepare_dataset(physionet_dir: str, database_dir: str) -> None:
+        """
+        Forward physionet_dir and database_dir, as keyword arguments, to the prepare_dataset function found in the
+        enclosing module scope (inside this body the name does not refer to this static method).
+        NOTE(review): presumably that function is the one defined in prepare_dataset.py - confirm against the imports.
+        """
+        prepare_dataset(physionet_dir=physionet_dir, database_dir=database_dir)

modelling_uniformer.py CHANGED Viewed

@@ -1,16 +1,17 @@
 from collections import OrderedDict
 from functools import partial
-from typing import Optional, Tuple, Union
 from math import isqrt
 import torch
 import torch.nn as nn
 from timm.models.layers import DropPath, to_2tuple, trunc_normal_
-from transformers import ViTConfig
 from transformers.modeling_outputs import ModelOutput
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import logging
 logger = logging.get_logger(__name__)
@@ -293,8 +294,7 @@ class UniFormerPreTrainedModel(PreTrainedModel):
     models.
     """
-    config_class = ViTConfig
-    base_model_prefix = "vit"
     main_input_name = "pixel_values"
     def _init_weights(self, m):

 from collections import OrderedDict
 from functools import partial
 from math import isqrt
+from typing import Optional, Tuple, Union
 import torch
 import torch.nn as nn
 from timm.models.layers import DropPath, to_2tuple, trunc_normal_
 from transformers.modeling_outputs import ModelOutput
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import logging
+from .configuration_uniformer import UniFormerWithProjectionHeadConfig
 logger = logging.get_logger(__name__)
     models.
     """
+    config_class = UniFormerWithProjectionHeadConfig
     main_input_name = "pixel_values"
     def _init_weights(self, m):

prepare_dataset.py ADDED Viewed

	@@ -0,0 +1,558 @@

+import json
+import multiprocessing
+import os
+import re
+import shutil
+from glob import glob
+from pathlib import Path
+import datasets
+import duckdb
+import numpy as np
+import pandas as pd
+try:
+    from .create_section_files import create_section_files
+except ImportError:
+    from create_section_files import create_section_files
+def mimic_cxr_image_path(dir, subject_id, study_id, dicom_id, ext='dcm'):
+    """
+    Build the path of an image as <dir>/p<first two characters of subject_id>/p<subject_id>/s<study_id>/<dicom_id>.<ext>.
+
+    Args:
+        dir: root directory of the images.
+        subject_id: subject identifier (converted with str()).
+        study_id: study identifier (converted with str()).
+        dicom_id: image identifier (converted with str()).
+        ext: file extension without the leading dot.
+
+    Returns:
+        The joined path as a string.
+    """
+    subject = str(subject_id)
+    parts = (
+        'p' + subject[:2],  # Subjects are grouped into directories by the first two characters of their ID.
+        'p' + subject,
+        's' + str(study_id),
+        str(dicom_id) + '.' + ext,
+    )
+    return os.path.join(dir, *parts)
+def format(text):
+    """
+    Normalise the whitespace of a string, or of every string in a list or NumPy array.
+
+    Newlines, tabs, and repeated whitespace are replaced by a single space, and leading and
+    trailing whitespace is removed. Missing values (as determined by pd.isna) are returned unchanged.
+
+    Args:
+        text: a string, a missing value, or a list/NumPy array of these.
+
+    Returns:
+        A list when `text` is a list or NumPy array; otherwise the cleaned string or the missing value.
+    """
+    def clean(string):
+        without_breaks = re.sub(r'\n|\t', ' ', string)
+        return re.sub(r'\s+', ' ', without_breaks).strip()
+    if isinstance(text, (np.ndarray, list)):
+        return [t if pd.isna(t) else clean(t) for t in text]
+    return text if pd.isna(text) else clean(text)
+def create_lookup_table(df, columns, start_idx):
+    """
+    Assign consecutive integer indices to the unique value combinations found in `columns`.
+
+    Args:
+        df: pandas DataFrame that contains `columns`.
+        columns: list of column names whose unique value combinations are indexed.
+        start_idx: index assigned to the first combination (after sorting by `columns`).
+
+    Returns:
+        A tuple of (lookup table, last assigned index). The lookup table holds one row per unique
+        combination, sorted by `columns`, with the assigned index in an 'index' column. When there
+        are no rows, the last assigned index is `start_idx - 1`, so `end_idx + 1` is still the
+        next free index for the caller.
+    """
+    # Keep the first row of each group, i.e., one row per unique combination:
+    df = df.groupby(columns).head(1)[columns].sort_values(by=columns)
+    df['index'] = range(start_idx, start_idx + len(df))
+    # Computed arithmetically, as indexing the range with [-1] raises an IndexError when it is empty:
+    return df, start_idx + len(df) - 1
+def lookup_tables(con, tables):
+    """
+    Build the lookup tables for every configured table and save them as 'lookup_tables.json' next to this module.
+
+    For each table, every column listed under 'index_columns' gets a mapping from each of its
+    values (as a string) to an integer index, and every column listed under 'value_columns' gets a
+    single integer index. Indices are consecutive within a table, starting from zero, and the
+    number of indices used is stored under the 'total' key.
+
+    Args:
+        con: database connection; `con.sql(query).df()` must return a pandas DataFrame.
+        tables: dictionary that maps a table name to its configuration.
+    """
+    luts = {}
+    for table_name, table_config in tables.items():
+        table_luts = {}
+        luts[table_name] = table_luts
+        next_idx = 0
+        index_columns = table_config['index_columns'] if 'index_columns' in table_config else ()
+        for column in index_columns:
+            column_df = con.sql(f"SELECT {column} FROM {table_name}").df()
+            lut, last_idx = create_lookup_table(column_df, [column], next_idx)
+            # The next column (or value column) continues where this one finished:
+            next_idx = last_idx + 1
+            table_luts[column] = {str(row[column]): int(row['index']) for _, row in lut.iterrows()}
+        value_columns = table_config['value_columns'] if 'value_columns' in table_config else ()
+        for column in value_columns:
+            table_luts[column] = next_idx
+            next_idx += 1
+        table_luts['total'] = next_idx
+    output_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'lookup_tables.json')
+    with open(output_path, 'w') as file:
+        json.dump(luts, file)
+def prepare_dataset(physionet_dir, database_dir, num_workers=None):
+    """
+    Link MIMIC-CXR studies to MIMIC-IV-ED data and write the lookup tables and per-split study IDs.
+
+    The steps are:
+        1. Extract the report sections with create_section_files (skipped if the sectioned .csv exists).
+        2. Load the MIMIC-IV-ED and MIMIC-CXR-JPG tables into an in-memory DuckDB database.
+        3. Create the lookup tables described by 'tables.json' (see lookup_tables).
+        4. Collapse the metadata to one row per study and left-join the ED stay whose
+           intime/outtime contain the study's latest image, followed by the triage, vital sign,
+           medicine reconciliation, and Pyxis rows of that stay. Vital sign and Pyxis rows charted
+           at or after the study's latest image are excluded.
+        5. Add the report sections, the split, and the prior studies of each study.
+        6. For each split, write the study IDs that have both a findings and an impression section
+           to JSON files next to this module.
+
+    Args:
+        physionet_dir: directory that contains the 'mimic-cxr', 'mimic-cxr-jpg', and 'mimic-iv-ed' directories.
+        database_dir: directory where the sectioned reports are stored (created if it does not exist).
+        num_workers: number of workers, defaults to the CPU count. Currently unused, as the export
+            that consumes it is commented out at the end of this function.
+
+    Raises:
+        AssertionError: if the reports, one of the required tables, or a single MIMIC-CXR-JPG
+            'files' directory cannot be found.
+    """
+    num_workers = num_workers if num_workers is not None else multiprocessing.cpu_count()
+    Path(database_dir).mkdir(parents=True, exist_ok=True)
+    sectioned_dir = os.path.join(database_dir, 'mimic_cxr_sectioned')
+    mimic_cxr_sectioned_path = os.path.join(sectioned_dir, 'mimic_cxr_sectioned.csv')
+    if not os.path.exists(mimic_cxr_sectioned_path):
+        print(f'{mimic_cxr_sectioned_path} does not exist, creating...')
+        # Check if reports exist. Only the reports of the first and last patients are checked; this compromises comprehensiveness for speed:
+        report_paths = [
+            os.path.join(physionet_dir, 'mimic-cxr/2.0.0/files/p10/p10000032/s50414267.txt'),
+            os.path.join(physionet_dir, 'mimic-cxr/2.0.0/files/p10/p10000032/s53189527.txt'),
+            os.path.join(physionet_dir, 'mimic-cxr/2.0.0/files/p10/p10000032/s53911762.txt'),
+            os.path.join(physionet_dir, 'mimic-cxr/2.0.0/files/p10/p10000032/s56699142.txt'),
+            os.path.join(physionet_dir, 'mimic-cxr/2.0.0/files/p19/p19999987/s55368167.txt'),
+            os.path.join(physionet_dir, 'mimic-cxr/2.0.0/files/p19/p19999987/s58621812.txt'),
+            os.path.join(physionet_dir, 'mimic-cxr/2.0.0/files/p19/p19999987/s58971208.txt'),
+        ]
+        assert all(os.path.isfile(i) for i in report_paths), f"""The reports do not exist with the following regex: {os.path.join(physionet_dir, 'mimic-cxr/2.0.0/files/p1*/p1*/s*.txt')}.
+        Please download them using wget -r -N -c -np --reject dcm --user <username> --ask-password https://physionet.org/files/mimic-cxr/2.0.0/"""
+        print('Extracting sections from reports...')
+        create_section_files(
+            reports_path=os.path.join(physionet_dir, 'mimic-cxr', '2.0.0', 'files'),
+            output_path=sectioned_dir,
+            no_split=True,
+        )
+    # Locate the required tables. Each glob result is checked before it is indexed, so that a
+    # missing table gives the instructive message below rather than an IndexError:
+    csv_paths = []
+    for i in ['edstays.csv.gz', 'medrecon.csv.gz', 'pyxis.csv.gz', 'triage.csv.gz', 'vitalsign.csv.gz']:
+        matches = glob(os.path.join(physionet_dir, 'mimic-iv-ed', '*', 'ed', i))
+        assert matches, f"""Table {i} is missing from MIMIC-IV-ED.
+            Please download the tables from https://physionet.org/content/mimic-iv-ed. Do not decompress them."""
+        csv_paths.append(matches[0])
+    for i in ['mimic-cxr-2.0.0-metadata.csv.gz', 'mimic-cxr-2.0.0-chexpert.csv.gz', 'mimic-cxr-2.0.0-split.csv.gz']:
+        matches = glob(os.path.join(physionet_dir, 'mimic-cxr-jpg', '*', i))
+        assert matches, f"""CSV file {i} is missing from MIMIC-CXR-JPG.
+            Please download the tables from https://physionet.org/content/mimic-cxr-jpg. Do not decompress them."""
+        csv_paths.append(matches[0])
+    con = duckdb.connect(':memory:')
+    for i in csv_paths:
+        # The table name is the file name without its extensions, with '-' and '.' replaced by '_',
+        # e.g., 'mimic-cxr-2.0.0-metadata.csv.gz' becomes 'mimic_cxr_2_0_0_metadata':
+        name = Path(i).stem.replace('.csv', '').replace('.gz', '').replace('-', '_').replace('.', '_')
+        print(f'Copying {name} into database...')
+        con.sql(f"CREATE OR REPLACE TABLE {name} AS FROM '{i}';")
+    # DuckDB has trouble reading the sectioned .csv file, read with pandas instead.
+    # The query below refers to this DataFrame by its variable name, `sections`:
+    sections = pd.read_csv(mimic_cxr_sectioned_path)
+    # Remove the first character from the study column and rename it to study_id:
+    con.sql(
+        """
+        CREATE OR REPLACE TABLE mimic_cxr_sectioned AS
+        SELECT *, CAST(SUBSTR(study, 2) AS INT32) AS study_id
+        FROM sections;
+        """
+    )
+    # Combine StudyDate and StudyTime into a single column and create the studies table:
+    con.sql(
+        """
+        CREATE OR REPLACE TABLE studies AS
+        SELECT *,
+            strptime(
+                CAST(StudyDate AS VARCHAR) || ' ' || lpad(split_part(CAST(StudyTime AS VARCHAR), '.', 1), 6, '0'),
+                '%Y%m%d %H%M%S'
+            ) AS study_datetime
+        FROM mimic_cxr_2_0_0_metadata;
+        """
+    )
+    # Load the table configuration:
+    with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tables.json'), 'r') as file:
+        tables = json.load(file)
+    # Create lookup tables:
+    lookup_tables(con, tables)
+    # Collapse to one row per study, aggregating each study's columns as a list:
+    con.sql(
+        """
+        CREATE OR REPLACE TABLE studies AS
+        SELECT
+            LIST(dicom_id) AS dicom_id,
+            FIRST(subject_id) AS subject_id,
+            study_id,
+            LIST(PerformedProcedureStepDescription) AS PerformedProcedureStepDescription,
+            LIST(ViewPosition) AS ViewPosition,
+            LIST(Rows) AS Rows,
+            LIST(Columns) AS Columns,
+            LIST(StudyDate) AS StudyDate,
+            LIST(StudyTime) AS StudyTime,
+            LIST(ProcedureCodeSequence_CodeMeaning) AS ProcedureCodeSequence_CodeMeaning,
+            LIST(ViewCodeSequence_CodeMeaning) AS ViewCodeSequence_CodeMeaning,
+            LIST(PatientOrientationCodeSequence_CodeMeaning) AS PatientOrientationCodeSequence_CodeMeaning,
+            LIST(study_datetime) AS study_datetime,
+            MAX(study_datetime) AS latest_study_datetime,
+        FROM studies
+        GROUP BY study_id;
+        """
+    )
+    # Join and filter the studies that overlap with ED stays:
+    con.sql(
+        """
+        CREATE OR REPLACE TABLE studies AS
+        SELECT
+            s.*,
+            e.hadm_id,
+            e.stay_id,
+            e.intime,
+            e.outtime,
+        FROM studies s
+        LEFT JOIN edstays e
+        ON s.subject_id = e.subject_id
+        AND e.intime < s.latest_study_datetime
+        AND e.outtime > s.latest_study_datetime
+        AND s.study_id != 59128861;
+        """
+    )  # Don't join study 59128861 as it overlaps with two ED stays
+    # Aggregate and add the edstays table:
+    con.sql(
+        """
+        CREATE OR REPLACE TABLE edstays_aggregated AS
+        SELECT
+            FIRST(subject_id) AS subject_id,
+            stay_id,
+            LIST(intime) AS intime,
+            LIST(outtime) AS outtime,
+            LIST(gender) AS gender,
+            LIST(race) AS race,
+            LIST(arrival_transport) AS arrival_transport,
+            LIST(disposition) AS disposition,
+        FROM edstays
+        GROUP BY stay_id;
+        """
+    )
+    con.sql(
+        """
+        CREATE OR REPLACE TABLE studies AS
+        SELECT
+            s.*,
+            e.intime AS edstays_intime,
+            e.outtime AS edstays_outtime,
+            e.gender AS edstays_gender,
+            e.race AS edstays_race,
+            e.arrival_transport AS edstays_arrival_transport,
+            e.disposition AS edstays_disposition,
+        FROM studies s
+        LEFT JOIN edstays_aggregated e
+        ON s.stay_id = e.stay_id;
+        """
+    )
+    # Aggregate and add the triage table:
+    con.sql(
+        """
+        CREATE OR REPLACE TABLE triage_aggregated AS
+        SELECT
+            FIRST(subject_id) AS subject_id,
+            stay_id,
+            LIST(temperature) as temperature,
+            LIST(heartrate) AS heartrate,
+            LIST(resprate) AS resprate,
+            LIST(o2sat) AS o2sat,
+            LIST(sbp) AS sbp,
+            LIST(dbp) AS dbp,
+            LIST(pain) AS pain,
+            LIST(acuity) AS acuity,
+            LIST(chiefcomplaint) AS chiefcomplaint,
+        FROM triage
+        GROUP BY stay_id;
+        """
+    )
+    con.sql(
+        """
+        CREATE OR REPLACE TABLE studies AS
+        SELECT
+            s.*,
+            t.temperature AS triage_temperature,
+            t.heartrate AS triage_heartrate,
+            t.resprate AS triage_resprate,
+            t.o2sat AS triage_o2sat,
+            t.sbp AS triage_sbp,
+            t.dbp AS triage_dbp,
+            t.pain AS triage_pain,
+            t.acuity AS triage_acuity,
+            t.chiefcomplaint AS triage_chiefcomplaint,
+        FROM studies s
+        LEFT JOIN triage_aggregated t
+        ON s.stay_id = t.stay_id;
+        """
+    )
+    # Aggregate and then add the vitalsign table (ensuring no rows with a charttime after the latest study_datetime):
+    con.sql(
+        """
+        CREATE OR REPLACE TABLE vitalsign_causal AS
+        SELECT v.*, s.latest_study_datetime, s.study_id,
+        FROM vitalsign v
+        JOIN studies s ON v.stay_id = s.stay_id
+        WHERE v.charttime < s.latest_study_datetime;
+        """
+    )  # This duplicates the rows for stay_ids that cover multiple study_ids. Hence, the following joins must be on study_id, not stay_id.
+    con.sql(
+        """
+        CREATE OR REPLACE TABLE vitalsign_aggregated AS
+        SELECT
+            study_id,
+            FIRST(subject_id) AS subject_id,
+            FIRST(stay_id) as stay_id,
+            LIST(charttime) AS charttime,
+            LIST(temperature) as temperature,
+            LIST(heartrate) AS heartrate,
+            LIST(resprate) AS resprate,
+            LIST(o2sat) AS o2sat,
+            LIST(sbp) AS sbp,
+            LIST(dbp) AS dbp,
+            LIST(rhythm) AS rhythm,
+            LIST(pain) AS pain,
+        FROM vitalsign_causal
+        GROUP BY study_id;
+        """
+    )
+    con.sql(
+        """
+        CREATE OR REPLACE TABLE studies AS
+        SELECT
+            s.*,
+            v.charttime AS vitalsign_charttime,
+            v.temperature AS vitalsign_temperature,
+            v.heartrate AS vitalsign_heartrate,
+            v.resprate AS vitalsign_resprate,
+            v.o2sat AS vitalsign_o2sat,
+            v.sbp AS vitalsign_sbp,
+            v.dbp AS vitalsign_dbp,
+            v.rhythm AS vitalsign_rhythm,
+            v.pain AS vitalsign_pain,
+        FROM studies s
+        LEFT JOIN vitalsign_aggregated v
+        ON s.study_id = v.study_id;
+        """
+    )
+    # Aggregate and then add the medrecon table:
+    con.sql(
+        """
+        CREATE OR REPLACE TABLE medrecon_aggregated AS
+        SELECT
+            FIRST(subject_id) AS subject_id,
+            stay_id,
+            LIST(charttime) AS charttime,
+            LIST(name) as name,
+            LIST(gsn) AS gsn,
+            LIST(ndc) AS ndc,
+            LIST(etc_rn) AS etc_rn,
+            LIST(etccode) AS etccode,
+            LIST(etcdescription) AS etcdescription,
+        FROM medrecon
+        GROUP BY stay_id;
+        """
+    )
+    con.sql(
+        """
+        CREATE OR REPLACE TABLE studies AS
+        SELECT
+            s.*,
+            m.charttime AS medrecon_charttime,
+            m.name AS medrecon_name,
+            m.gsn AS medrecon_gsn,
+            m.ndc AS medrecon_ndc,
+            m.etc_rn AS medrecon_etc_rn,
+            m.etccode AS medrecon_etccode,
+            m.etcdescription AS medrecon_etcdescription,
+        FROM studies s
+        LEFT JOIN medrecon_aggregated m
+        ON s.stay_id = m.stay_id;
+        """
+    )
+    # Aggregate and then add the pyxis table (ensuring no rows with a charttime after the latest study_datetime):
+    con.sql(
+        """
+        CREATE OR REPLACE TABLE pyxis_causal AS
+        SELECT p.*, s.latest_study_datetime, s.study_id,
+        FROM pyxis p
+        JOIN studies s ON p.stay_id = s.stay_id
+        WHERE p.charttime < s.latest_study_datetime;
+        """
+    ) # This duplicates the rows for stay_ids that cover multiple study_ids. Hence, the following joins must be on study_id, not stay_id.
+    con.sql(
+        """
+        CREATE OR REPLACE TABLE pyxis_aggregated AS
+        SELECT
+            study_id,
+            FIRST(subject_id) AS subject_id,
+            FIRST(stay_id) as stay_id,
+            LIST(charttime) AS charttime,
+            LIST(med_rn) as med_rn,
+            LIST(name) as name,
+            LIST(gsn_rn) AS gsn_rn,
+            LIST(gsn) AS gsn,
+        FROM pyxis_causal
+        GROUP BY study_id;
+        """
+    )
+    con.sql(
+        """
+        CREATE OR REPLACE TABLE studies AS
+        SELECT
+            s.*,
+            p.charttime AS pyxis_charttime,
+            p.med_rn AS pyxis_med_rn,
+            p.name AS pyxis_name,
+            p.gsn_rn AS pyxis_gsn_rn,
+            p.gsn AS pyxis_gsn,
+        FROM studies s
+        LEFT JOIN pyxis_aggregated p
+        ON s.study_id = p.study_id;
+        """
+    )
+    # Add the reports:
+    con.sql(
+        """
+        CREATE OR REPLACE TABLE studies AS
+        SELECT s.*, r.findings, r.impression, r.indication, r.history, r.comparison, r.last_paragraph, r.technique,
+        FROM studies s
+        LEFT JOIN mimic_cxr_sectioned r
+        ON s.study_id = r.study_id
+        """
+    )
+    # Aggregate and then add the splits:
+    con.sql(
+        """
+        CREATE OR REPLACE TABLE split_aggregated AS
+        SELECT
+            study_id,
+            FIRST(split) AS split,
+        FROM mimic_cxr_2_0_0_split
+        GROUP BY study_id;
+        """
+    )
+    con.sql(
+        """
+        CREATE OR REPLACE TABLE studies AS
+        SELECT s.*, x.split,
+        FROM studies s
+        JOIN split_aggregated x
+        ON s.study_id = x.study_id;
+        """
+    )
+    # Prior studies column (for each study, the IDs and datetimes of the subject's earlier studies):
+    con.sql(
+        """
+        CREATE OR REPLACE TABLE prior_studies AS
+        WITH sorted AS (
+            SELECT *,
+                ROW_NUMBER() OVER (PARTITION BY subject_id ORDER BY latest_study_datetime) AS rn
+            FROM studies
+        ),
+        aggregated AS (
+            SELECT subject_id,
+                study_id,
+                latest_study_datetime,
+                ARRAY_AGG(study_id) OVER (PARTITION BY subject_id ORDER BY rn ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) AS prior_study_ids,
+                ARRAY_AGG(latest_study_datetime) OVER (PARTITION BY subject_id ORDER BY rn ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) AS prior_study_datetimes
+            FROM sorted
+        )
+        SELECT *
+        FROM aggregated;
+        """
+    )
+    con.sql(
+        """
+        CREATE OR REPLACE TABLE studies AS
+        SELECT s.*, p.prior_study_ids, p.prior_study_datetimes,
+        FROM studies s
+        LEFT JOIN prior_studies p
+        ON s.study_id = p.study_id
+        ORDER BY s.subject_id, s.study_datetime DESC;
+        """
+    )
+    # Text columns. Columns are named '<table>_<column>', except those of mimic_cxr_sectioned, which keep their own name:
+    text_columns = [f'{k}_{j}' if k != 'mimic_cxr_sectioned' else j for k, v in tables.items() if 'text_columns' in v for j in (v['text_columns'] if isinstance(v['text_columns'], list) else [v['text_columns']])] + ['findings', 'impression']
+    pattern = os.path.join(physionet_dir, 'mimic-cxr-jpg', '*', 'files')
+    mimic_cxr_jpg_dir = glob(pattern)
+    # Exactly one match is required; previously only a non-empty result was checked, despite the message:
+    assert len(mimic_cxr_jpg_dir) == 1, f'Expected exactly one directory to match the pattern {pattern}, but found {len(mimic_cxr_jpg_dir)}: {mimic_cxr_jpg_dir}. Only one is required.'
+    mimic_cxr_jpg_dir = mimic_cxr_jpg_dir[0]
+    # Reads the JPG bytes of every image of every study in a batch. Only used by the export that is commented out below:
+    def load_image(row):
+        images = []
+        for dicom_ids, study_id, subject_id in zip(row['dicom_id'], row['study_id'], row['subject_id']):
+            study_images = []
+            for dicom_id in dicom_ids:
+                image_path = mimic_cxr_image_path(mimic_cxr_jpg_dir, subject_id, study_id, dicom_id, 'jpg')
+                with open(image_path, 'rb') as f:
+                    image = f.read()
+                study_images.append(image)
+            images.append(study_images)
+        row['images'] = images
+        return row
+    dataset_dict = {}
+    for split in ['test', 'validate', 'train']:
+        df = con.sql(f"FROM studies WHERE split = '{split}'").df()
+        # Format text columns:
+        for i in text_columns:
+            df[i] = df[i].apply(format)
+        # Save indices for each split (only studies with both a findings and an impression section):
+        df[df['findings'].notna() & df['impression'].notna()]['study_id'].to_json(
+            os.path.join(os.path.dirname(os.path.abspath(__file__)), f'mimic_cxr_jpg_{split}_study_ids.json'),
+            orient='records',
+            lines=False,
+        )
+        # As above, but restricted to the studies that were linked to an ED stay:
+        df_stay_id = df[df['findings'].notna() & df['impression'].notna() & df['stay_id'].notna()][['study_id', 'stay_id']]
+        df_stay_id['stay_id'] = df_stay_id['stay_id'].astype(int)
+        df_stay_id['study_id'].to_json(
+            os.path.join(os.path.dirname(os.path.abspath(__file__)), f'mimic_iv_ed_mimic_cxr_jpg_{split}_study_ids.json'),
+            orient='records',
+            lines=False,
+        )
+        if split == 'test':
+            # Test studies linked to an ED stay for which at least one pyxis column is not missing:
+            pyxis_columns = [col for col in df.columns if col.startswith('pyxis_')]
+            df_pyxis = df[df['findings'].notna() & df['impression'].notna() & df['stay_id'].notna()]
+            df_pyxis = df_pyxis[~df_pyxis[pyxis_columns].isna().all(axis=1)]
+            df_pyxis['study_id'].to_json(
+                os.path.join(os.path.dirname(os.path.abspath(__file__)), f'mimic_iv_ed_mimic_cxr_jpg_pyxis_{split}_study_ids.json'),
+                orient='records',
+                lines=False,
+            )
+            # Test studies linked to an ED stay for which at least one vitalsign column is not missing:
+            vitalsign_columns = [col for col in df.columns if col.startswith('vitalsign_')]
+            df_vitalsign = df[df['findings'].notna() & df['impression'].notna() & df['stay_id'].notna()]
+            df_vitalsign = df_vitalsign[~df_vitalsign[vitalsign_columns].isna().all(axis=1)]
+            df_vitalsign['study_id'].to_json(
+                os.path.join(os.path.dirname(os.path.abspath(__file__)), f'mimic_iv_ed_mimic_cxr_jpg_vitalsign_{split}_study_ids.json'),
+                orient='records',
+                lines=False,
+            )
+    # NOTE: the export of the dataset to disk is disabled below. While it is disabled, num_workers,
+    # load_image, and dataset_dict are unused and the database connection is not explicitly closed.
+    #     dataset_dict[split] = datasets.Dataset.from_pandas(df)
+    #     cache_dir = os.path.join(database_dir, '.cache')
+    #     Path(cache_dir).mkdir(parents=True, exist_ok=True)
+    #     dataset_dict[split] = dataset_dict[split].map(
+    #         load_image,
+    #         num_proc=num_workers,
+    #         writer_batch_size=8,
+    #         batched=True,
+    #         batch_size=8,
+    #         keep_in_memory=False,
+    #         cache_file_name=os.path.join(cache_dir, f'.{split}'),
+    #         load_from_cache_file=False,
+    #     )
+    #     dataset_dict[split].cleanup_cache_files()
+    #     shutil.rmtree(cache_dir)
+    # dataset = datasets.DatasetDict(dataset_dict)
+    # dataset.save_to_disk(os.path.join(database_dir, 'mimic_iv_ed_mimic_cxr_jpg_dataset'))
+    # con.close()
+if __name__ == "__main__":
+    # Script entry point. Both paths are hard-coded absolute paths; adjust them for your environment before running.
+    prepare_dataset(
+        # Where MIMIC-CXR, MIMIC-CXR-JPG, and MIMIC-IV-ED are stored:
+        physionet_dir='/datasets/work/hb-mlaifsp-mm/work/archive/physionet.org/files',
+        # Where the resultant database will be stored:
+        database_dir='/scratch3/nic261/database/cxrmate_ed',
+    )

utils.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import torch
+def compute_time_delta(event_time, reference_time, time_delta_map, denominator = 3600, to_tensor=True):
+    """
+    Compute the mapped time difference between an event and a reference time.
+
+    The difference `reference_time - event_time` is converted to seconds, divided by
+    `denominator` (3600 by default, i.e., the difference is expressed in hours), and then passed
+    through `time_delta_map`.
+
+    Background on how time deltas are transformed: it appears that minutes are used as the input
+    to a weight matrix in "Self-Supervised Transformer for Sparse and Irregularly Sampled
+    Multivariate Clinical Time-Series". This is almost confirmed by the CVE class defined here:
+    https://github.com/sindhura97/STraTS/blob/main/strats_notebook.ipynb, where the input has
+    a size of one.
+
+    Args:
+        event_time: time of the event; `reference_time - event_time` must provide total_seconds().
+        reference_time: time that the event is compared to; must not be earlier than `event_time`.
+        time_delta_map: callable applied to the scaled time difference.
+        denominator: number of seconds per unit of the scaled time difference.
+        to_tensor: if True, the mapped value is wrapped with torch.tensor.
+
+    Returns:
+        The mapped time difference, as a tensor if `to_tensor` is True.
+
+    Raises:
+        ValueError: if the event is after the reference time.
+    """
+    elapsed = reference_time - event_time
+    time_delta = elapsed.total_seconds() / denominator
+    assert isinstance(time_delta, float), f'time_delta should be float, not {type(time_delta)}.'
+    if time_delta < 0:
+        raise ValueError(f'time_delta should be greater than or equal to zero, not {time_delta}.')
+    mapped = time_delta_map(time_delta)
+    return torch.tensor(mapped) if to_tensor else mapped