zy5830850 committed on
Commit • 91ef820
1 Parent(s): 232404e
First model version
This view is limited to 50 files because it contains too many changes.
- app.py.py +150 -0
- images/649af982-e3af4e3a-75013d30-cdc71514-a34738fd.jpg +0 -0
- med_rpg/__init__.py +17 -0
- med_rpg/__pycache__/__init__.cpython-310.pyc +0 -0
- med_rpg/__pycache__/data_loader.cpython-310.pyc +0 -0
- med_rpg/__pycache__/data_loader.cpython-37.pyc +0 -0
- med_rpg/__pycache__/engine.cpython-37.pyc +0 -0
- med_rpg/__pycache__/med_rpg.cpython-310.pyc +0 -0
- med_rpg/__pycache__/transforms.cpython-310.pyc +0 -0
- med_rpg/__pycache__/transforms.cpython-37.pyc +0 -0
- med_rpg/data/00363400-cee06fa7-8c2ca1f7-2678a170-b3a62a6e.jpg +0 -0
- med_rpg/data/04e10148-c36f7afb-d0aaf964-152d8a5d-a02ab550.jpg +0 -0
- med_rpg/data/1176839d-cf4f677f-d597a1ef-548bc32a-c05429f3.jpg +0 -0
- med_rpg/data/13255e1f-91b7b172-02baaeee-340ec493-0e531681.jpg +0 -0
- med_rpg/data/4b7f7a4c-18c39245-53724c25-06878595-7e41bb94.jpg +0 -0
- med_rpg/data/649af982-e3af4e3a-75013d30-cdc71514-a34738fd.jpg +0 -0
- med_rpg/data/95423e8e-45dff550-563d3eba-b8bc94be-a87f5a1d.jpg +0 -0
- med_rpg/data_loader.py +376 -0
- med_rpg/demo.py +222 -0
- med_rpg/med_rpg.py +268 -0
- med_rpg/models/MHA.py +467 -0
- med_rpg/models/__init__.py +6 -0
- med_rpg/models/__pycache__/MHA.cpython-310.pyc +0 -0
- med_rpg/models/__pycache__/MHA.cpython-37.pyc +0 -0
- med_rpg/models/__pycache__/__init__.cpython-310.pyc +0 -0
- med_rpg/models/__pycache__/__init__.cpython-37.pyc +0 -0
- med_rpg/models/__pycache__/trans_vg_ca.cpython-310.pyc +0 -0
- med_rpg/models/__pycache__/trans_vg_ca.cpython-37.pyc +0 -0
- med_rpg/models/__pycache__/vl_transformer.cpython-310.pyc +0 -0
- med_rpg/models/__pycache__/vl_transformer.cpython-37.pyc +0 -0
- med_rpg/models/language_model/__init__.py +0 -0
- med_rpg/models/language_model/__pycache__/__init__.cpython-310.pyc +0 -0
- med_rpg/models/language_model/__pycache__/__init__.cpython-37.pyc +0 -0
- med_rpg/models/language_model/__pycache__/bert.cpython-310.pyc +0 -0
- med_rpg/models/language_model/__pycache__/bert.cpython-37.pyc +0 -0
- med_rpg/models/language_model/bert.py +63 -0
- med_rpg/models/trans_vg_ca.py +88 -0
- med_rpg/models/transformer.py +314 -0
- med_rpg/models/visual_model/__init__.py +0 -0
- med_rpg/models/visual_model/__pycache__/__init__.cpython-310.pyc +0 -0
- med_rpg/models/visual_model/__pycache__/__init__.cpython-37.pyc +0 -0
- med_rpg/models/visual_model/__pycache__/backbone.cpython-310.pyc +0 -0
- med_rpg/models/visual_model/__pycache__/backbone.cpython-37.pyc +0 -0
- med_rpg/models/visual_model/__pycache__/detr.cpython-310.pyc +0 -0
- med_rpg/models/visual_model/__pycache__/detr.cpython-37.pyc +0 -0
- med_rpg/models/visual_model/__pycache__/position_encoding.cpython-310.pyc +0 -0
- med_rpg/models/visual_model/__pycache__/position_encoding.cpython-37.pyc +0 -0
- med_rpg/models/visual_model/__pycache__/transformer.cpython-310.pyc +0 -0
- med_rpg/models/visual_model/__pycache__/transformer.cpython-37.pyc +0 -0
- med_rpg/models/visual_model/backbone.py +121 -0
app.py.py
ADDED
@@ -0,0 +1,150 @@
import gradio as gr
import torch

import sys
# sys.path.insert(0, '/Users/daipl/Desktop/MedRPG_Demo/med_rpg')
sys.path.insert(0, 'med_rpg')

# import datasets
from models import build_model
from med_rpg import get_args_parser, medical_phrase_grounding
import PIL.Image as Image
from transformers import AutoTokenizer

'''
build args
'''
parser = get_args_parser()
args = parser.parse_args()

'''
build model
'''

# device = 'cpu'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device('mps')

# Check that MPS is available
# if not torch.backends.mps.is_available():
#     if not torch.backends.mps.is_built():
#         print("MPS not available because the current PyTorch install was not "
#               "built with MPS enabled.")
#     else:
#         print("MPS not available because the current MacOS version is not 12.3+ "
#               "and/or you do not have an MPS-enabled device on this machine.")

# else:
#     device = torch.device("mps")

tokenizer = AutoTokenizer.from_pretrained(args.bert_model, do_lower_case=True)
## build model
model = build_model(args)
model.to(device)
checkpoint = torch.load(args.eval_model, map_location='cpu')
model.load_state_dict(checkpoint['model'], strict=False)

'''
inference model
'''
@torch.no_grad()
def inference(image, text, bbox = [0, 0, 0, 0]):
    image = image.convert("RGB")
    # if bbox is not None:
    #     bbox = bbox.to_numpy(dtype='int')[0].tolist()
    with torch.autocast(device_type='cuda', dtype=torch.float16):
        return medical_phrase_grounding(model, tokenizer, image, text, bbox)

# """
# Small left apical pneumothorax unchanged in size since ___:56 a.m.,
# and no appreciable left pleural effusion,
# basal pleural tubes still in place and reportedly on waterseal.
# Greater coalescence of consolidation in both the right mid and lower lung zones could be progressive atelectasis but is more concerning for pneumonia.
# Consolidation in the left lower lobe, however, has improved since ___ through ___.
# There is no right pleural effusion or definite right pneumothorax.
# Cardiomediastinal silhouette is normal.
# Distention of large and small bowel seen in the imaged portion of the upper abdomen is unchanged.
# """

def get_result(image, evt: gr.SelectData):
    if evt.value:
        bbox = evt.value[1][1:-1]  # Remove "[" and "]"
        bbox = [int(num) for num in bbox.split(",")]
        output_img = inference(image, evt.value[0], bbox)
        return evt.value[0], output_img

GT_text = {
    "Finding 1": "Small left apical pneumothorax",
    "Finding 2": "Greater coalescence of consolidation in both the right mid and lower lung zones",
    "Finding 3": "Consilidation in the left lower lobe"
}
# GT_bboxes = {"Finding 1": [1, 332, 28, 141, 48], "Finding 2": [2, 57, 177, 163, 165], "Finding 3": [3, 325, 231, 183, 132]}
GT_bboxes = {"Finding 1": [1, 332, 28, 332+141, 28+48], "Finding 2": [2, 57, 177, 163+57, 165+177], "Finding 3": [3, 325, 231, 183+325, 132+231]}
def get_new_result(image, evt: gr.SelectData):
    if evt.value[1]:
        if evt.value[0] == "(Show GT)":
            bbox = GT_bboxes[evt.value[1]]
            text = GT_text[evt.value[1]]
        else:
            bbox = [GT_bboxes[evt.value[1]][0], 0, 0, 0, 0]
            text = evt.value[0]
        output_img = inference(image, text, bbox)
        return text, output_img

def clear():
    return ""

with gr.Blocks() as demo:
    gr.Markdown(
        """
        <center> <h1>Medical Phrase Grounding Demo</h1> </center>
        <p style='text-align: center'> <a href='https://arxiv.org/abs/2303.07618' target='_blank'>Paper</a> </p>
        """)
    with gr.Row():
        with gr.Column(scale=1, min_width=300):
            input_image = gr.Image(type='pil', value="./images/649af982-e3af4e3a-75013d30-cdc71514-a34738fd.jpg")
            hl_text = gr.HighlightedText(
                label="Medical Report",
                combine_adjacent=False,
                # combine_adjacent=True,
                show_legend=False,
                # value = [("Small left apical pneumothorax","[332, 28, 141, 48]"),
                #          ("unchanged in size since ___:56 a.m., and no appreciable left pleural effusion, basal pleural tubes still in place and reportedly on waterseal.", None),
                #          ("Greater coalescence of consolidation in both the right mid and lower lung zones","[57, 177, 163, 165]"),
                #          ("could be progressive atelectasis but is more concerning for pneumonia.", None),
                #          ("Consilidation in the left lower lobe","[325, 231, 183, 132]"),
                #          (", however, has improved since ___ through ___.", None),
                #          # ("There is no right pleural effusion or definite right pneumothorax.", None),
                #          # ("Cardiomediastinal silhouette is normal.", None),
                #          # ("Distention of large and small bowel seen in the imaged portion of the upper abdomen is unchanged.", None),
                #          ]
                value = [("Small left apical pneumothorax","Finding 1"),
                         ("(Show GT)","Finding 1"),
                         ("unchanged in size since ___:56 a.m., and no appreciable left pleural effusion, basal pleural tubes still in place and reportedly on waterseal.", None),
                         ("Greater coalescence of consolidation in both the right mid and lower lung zones","Finding 2"),
                         ("(Show GT)","Finding 2"),
                         ("could be progressive atelectasis but is more concerning for pneumonia.", None),
                         ("Consilidation in the left lower lobe","Finding 3"),
                         ("(Show GT)","Finding 3"),
                         # ", however, has improved since ___ through ___.",
                         (", however, has improved since ___ through ___.", None),
                         ]
            )
            input_text = gr.Textbox(label="Input Text", interactive=False)
            # bbox = gr.Dataframe(
            #     headers=["x", "y", "w", "h"],
            #     datatype=["number", "number", "number", "number"],
            #     label="Ground-Truth Bounding Box",
            #     value=[[332, 28, 141, 48]]
            # )
            # with gr.Row():
            #     clear_btn = gr.Button("Clear")
            #     run_btn = gr.Button("Run")
        # output = gr.Image(type="pil", label="Grounding Results", interactive=False).style(height=500)
        output = gr.Image(type="pil", value="./images/649af982-e3af4e3a-75013d30-cdc71514-a34738fd.jpg", label="Grounding Results", interactive=False).style(height=500)
    hl_text.select(get_new_result, inputs=[input_image], outputs=[input_text, output])
    # run_btn.click(fn=inference, inputs=[input_image, input_text], outputs=output)
    # clear_btn.click(fn=clear, outputs=input_text)
demo.launch(share=True)
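Note on the GT_bboxes literal above: the commented-out line keeps the original MS-CXR annotations in xywh form, while the active line converts them inline to [finding_id, x1, y1, x2, y2]. A minimal sketch of that conversion (the helper name is illustrative, not part of this commit):

# Hypothetical helper mirroring the inline arithmetic in GT_bboxes:
# an xywh annotation [x, y, w, h] becomes [finding_id, x1, y1, x2, y2].
def to_gt_entry(finding_id, xywh):
    x, y, w, h = xywh
    return [finding_id, x, y, x + w, y + h]

print(to_gt_entry(1, [332, 28, 141, 48]))  # [1, 332, 28, 473, 76] == GT_bboxes["Finding 1"]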
images/649af982-e3af4e3a-75013d30-cdc71514-a34738fd.jpg
ADDED
med_rpg/__init__.py
ADDED
@@ -0,0 +1,17 @@
# import med_rpg.utils.misc as misc
# from med_rpg.utils.box_utils import xywh2xyxy
# from med_rpg.utils.visual_bbox import visualBBox
# from med_rpg.models import build_model
# from med_rpg.med_rpg import get_args_parser
# import med_rpg.transforms as T

# import med_rpg.utils.misc
# import med_rpg.utils.misc as misc
# from .open_inst import open_instseg
# from .open_pano import open_panoseg
# from .open_sem import open_semseg
# from .ref_cap import referring_captioning
# from .ref_in import referring_inpainting
# from .ref_seg import referring_segmentation
# from .text_ret import text_retrieval
# from .reg_ret import region_retrieval
med_rpg/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (137 Bytes)
med_rpg/__pycache__/data_loader.cpython-310.pyc
ADDED
Binary file (9.5 kB)
med_rpg/__pycache__/data_loader.cpython-37.pyc
ADDED
Binary file (9.51 kB)
med_rpg/__pycache__/engine.cpython-37.pyc
ADDED
Binary file (7.65 kB)
med_rpg/__pycache__/med_rpg.cpython-310.pyc
ADDED
Binary file (6.68 kB)
med_rpg/__pycache__/transforms.cpython-310.pyc
ADDED
Binary file (10.1 kB)
med_rpg/__pycache__/transforms.cpython-37.pyc
ADDED
Binary file (10.8 kB)
med_rpg/data/00363400-cee06fa7-8c2ca1f7-2678a170-b3a62a6e.jpg
ADDED
med_rpg/data/04e10148-c36f7afb-d0aaf964-152d8a5d-a02ab550.jpg
ADDED
med_rpg/data/1176839d-cf4f677f-d597a1ef-548bc32a-c05429f3.jpg
ADDED
med_rpg/data/13255e1f-91b7b172-02baaeee-340ec493-0e531681.jpg
ADDED
med_rpg/data/4b7f7a4c-18c39245-53724c25-06878595-7e41bb94.jpg
ADDED
med_rpg/data/649af982-e3af4e3a-75013d30-cdc71514-a34738fd.jpg
ADDED
med_rpg/data/95423e8e-45dff550-563d3eba-b8bc94be-a87f5a1d.jpg
ADDED
med_rpg/data_loader.py
ADDED
@@ -0,0 +1,376 @@
# -*- coding: utf-8 -*-

"""
ReferIt, UNC, UNC+ and GRef referring image segmentation PyTorch dataset.

Define and group batches of images, segmentations and queries.
Based on:
https://github.com/chenxi116/TF-phrasecut-public/blob/master/build_batches.py
"""

import os
import re
# import cv2
import sys
import json
import torch
import numpy as np
import os.path as osp
import scipy.io as sio
import torch.utils.data as data
sys.path.append('.')

from PIL import Image
from transformers import AutoTokenizer, AutoModel
# from pytorch_pretrained_bert.tokenization import BertTokenizer
# from transformers import BertTokenizer
from utils.word_utils import Corpus
from utils.box_utils import sampleNegBBox
from utils.genome_utils import getCLSLabel


def read_examples(input_line, unique_id):
    """Read a list of `InputExample`s from an input file."""
    examples = []
    # unique_id = 0
    line = input_line  # reader.readline()
    # if not line:
    #     break
    line = line.strip()
    text_a = None
    text_b = None
    m = re.match(r"^(.*) \|\|\| (.*)$", line)
    if m is None:
        text_a = line
    else:
        text_a = m.group(1)
        text_b = m.group(2)
    examples.append(
        InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
    # unique_id += 1
    return examples

## Bert text encoding
class InputExample(object):
    def __init__(self, unique_id, text_a, text_b):
        self.unique_id = unique_id
        self.text_a = text_a
        self.text_b = text_b

class InputFeatures(object):
    """A single set of features of data."""
    def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
        self.unique_id = unique_id
        self.tokens = tokens
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.input_type_ids = input_type_ids

def convert_examples_to_features(examples, seq_length, tokenizer, usemarker=None):
    """Loads a data file into a list of `InputBatch`s."""
    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
        else:
            if usemarker is not None:
                # tokens_a = ['a', 'e', 'b', '*', 'c', 'd', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', '*', 'u']
                marker_idx = [i for i,x in enumerate(tokens_a) if x=='*']
                if marker_idx[1] > seq_length - 3 and len(tokens_a) - seq_length+1 < marker_idx[0]:  # the second '*' must not sit beyond seq_length-3, and the first '*' must survive truncation from the front
                    tokens_a = tokens_a[-(seq_length-2):]
                    new_marker_idx = [i for i,x in enumerate(tokens_a) if x=='*']
                    if len(new_marker_idx) < 2:  # the first marker was dropped by the truncation
                        pass
                elif len(tokens_a) - seq_length+1 >= marker_idx[0]:
                    max_len = min(marker_idx[1]-marker_idx[0]+1, seq_length-2)
                    tokens_a = tokens_a[marker_idx[0]: marker_idx[0]+max_len]
                    tokens_a[-1] = '*'  # if the marked span runs past the limit, force the last position to be '*'
                elif marker_idx[1]-marker_idx[0]<2:
                    tokens_a = [i for i in tokens_a if i != '*']
                    tokens_a = ['*'] + tokens_a + ['*']  # if the two '*' are adjacent, strip them and place '*' at both ends
                else:
                    if len(tokens_a) > seq_length - 2:
                        tokens_a = tokens_a[0:(seq_length - 2)]
            else:
                # Account for [CLS] and [SEP] with "- 2"
                if len(tokens_a) > seq_length - 2:
                    tokens_a = tokens_a[0:(seq_length - 2)]

        tokens = []
        input_type_ids = []
        tokens.append("[CLS]")
        input_type_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            input_type_ids.append(0)
        tokens.append("[SEP]")
        input_type_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                input_type_ids.append(1)
            tokens.append("[SEP]")
            input_type_ids.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < seq_length:
            input_ids.append(0)
            input_mask.append(0)
            input_type_ids.append(0)

        assert len(input_ids) == seq_length
        assert len(input_mask) == seq_length
        assert len(input_type_ids) == seq_length
        features.append(
            InputFeatures(
                unique_id=example.unique_id,
                tokens=tokens,
                input_ids=input_ids,
                input_mask=input_mask,
                input_type_ids=input_type_ids))
    return features

class DatasetNotFoundError(Exception):
    pass

class TransVGDataset(data.Dataset):
    SUPPORTED_DATASETS = {
        'referit': {'splits': ('train', 'val', 'trainval', 'test')},
        'unc': {
            'splits': ('train', 'val', 'trainval', 'testA', 'testB'),
            'params': {'dataset': 'refcoco', 'split_by': 'unc'}
        },
        'unc+': {
            'splits': ('train', 'val', 'trainval', 'testA', 'testB'),
            'params': {'dataset': 'refcoco+', 'split_by': 'unc'}
        },
        'gref': {
            'splits': ('train', 'val'),
            'params': {'dataset': 'refcocog', 'split_by': 'google'}
        },
        'gref_umd': {
            'splits': ('train', 'val', 'test'),
            'params': {'dataset': 'refcocog', 'split_by': 'umd'}
        },
        'flickr': {
            'splits': ('train', 'val', 'test')
        },
        'MS_CXR': {
            'splits': ('train', 'val', 'test'),
            'params': {'dataset': 'MS_CXR', 'split_by': 'MS_CXR'}
        },
        'ChestXray8': {
            'splits': ('train', 'val', 'test'),
            'params': {'dataset': 'ChestXray8', 'split_by': 'ChestXray8'}
        },
        'SGH_CXR_V1': {
            'splits': ('train', 'val', 'test'),
            'params': {'dataset': 'SGH_CXR_V1', 'split_by': 'SGH_CXR_V1'}
        }

    }

    def __init__(self, args, data_root, split_root='data', dataset='referit',
                 transform=None, return_idx=False, testmode=False,
                 split='train', max_query_len=128, lstm=False,
                 bert_model='bert-base-uncased'):
        self.images = []
        self.data_root = data_root
        self.split_root = split_root
        self.dataset = dataset
        self.query_len = max_query_len
        self.lstm = lstm
        self.transform = transform
        self.testmode = testmode
        self.split = split
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model, do_lower_case=True)
        self.return_idx=return_idx
        self.args = args
        self.ID_Categories = {1: 'Cardiomegaly', 2: 'Lung Opacity', 3:'Edema', 4: 'Consolidation', 5: 'Pneumonia', 6:'Atelectasis', 7: 'Pneumothorax', 8:'Pleural Effusion'}

        assert self.transform is not None

        if split == 'train':
            self.augment = True
        else:
            self.augment = False

        if self.dataset == 'MS_CXR':
            self.dataset_root = osp.join(self.data_root, 'MS_CXR')
            self.im_dir = self.dataset_root  # the full image paths are stored in the split file
        elif self.dataset == 'ChestXray8':
            self.dataset_root = osp.join(self.data_root, 'ChestXray8')
            self.im_dir = self.dataset_root  # the full image paths are stored in the split file
        elif self.dataset == 'SGH_CXR_V1':
            self.dataset_root = osp.join(self.data_root, 'SGH_CXR_V1')
            self.im_dir = self.dataset_root  # the full image paths are stored in the split file
        elif self.dataset == 'referit':
            self.dataset_root = osp.join(self.data_root, 'referit')
            self.im_dir = osp.join(self.dataset_root, 'images')
            self.split_dir = osp.join(self.dataset_root, 'splits')
        elif self.dataset == 'flickr':
            self.dataset_root = osp.join(self.data_root, 'Flickr30k')
            self.im_dir = osp.join(self.dataset_root, 'flickr30k_images')
        else:  ## refcoco, etc.
            self.dataset_root = osp.join(self.data_root, 'other')
            self.im_dir = osp.join(
                self.dataset_root, 'images', 'mscoco', 'images', 'train2014')
            self.split_dir = osp.join(self.dataset_root, 'splits')

        if not self.exists_dataset():
            # self.process_dataset()
            print('Please download index cache to data folder: \n \
                https://drive.google.com/open?id=1cZI562MABLtAzM6YU4WmKPFFguuVr0lZ')
            exit(0)

        dataset_path = osp.join(self.split_root, self.dataset)
        valid_splits = self.SUPPORTED_DATASETS[self.dataset]['splits']

        if self.lstm:
            self.corpus = Corpus()
            corpus_path = osp.join(dataset_path, 'corpus.pth')
            self.corpus = torch.load(corpus_path)

        if split not in valid_splits:
            raise ValueError(
                'Dataset {0} does not have split {1}'.format(
                    self.dataset, split))

        splits = [split]
        if self.dataset != 'referit':
            splits = ['train', 'val'] if split == 'trainval' else [split]
        for split in splits:
            imgset_file = '{0}_{1}.pth'.format(self.dataset, split)
            imgset_path = osp.join(dataset_path, imgset_file)
            self.images += torch.load(imgset_path)

    def exists_dataset(self):
        return osp.exists(osp.join(self.split_root, self.dataset))

    def pull_item(self, idx):
        info = {}
        if self.dataset == 'MS_CXR':
            # anno_id, image_id, category_id, img_file, bbox, width, height, phrase, phrase_marker = self.images[idx]  # the three key fields: img_file, bbox, phrase
            anno_id, image_id, category_id, img_file, bbox, width, height, phrase = self.images[idx]  # the three key fields: img_file, bbox, phrase
            info['anno_id'] = anno_id
            info['category_id'] = category_id
        elif self.dataset == 'ChestXray8':
            anno_id, image_id, category_id, img_file, bbox, phrase, prompt_text = self.images[idx]  # the three key fields: img_file, bbox, phrase
            info['anno_id'] = anno_id
            info['category_id'] = category_id
            # info['img_file'] = img_file
        elif self.dataset == 'SGH_CXR_V1':
            anno_id, image_id, category_id, img_file, bbox, phrase, patient_id = self.images[idx]  # the three key fields: img_file, bbox, phrase
            info['anno_id'] = anno_id
            info['category_id'] = category_id
        elif self.dataset == 'flickr':
            img_file, bbox, phrase = self.images[idx]
        else:
            img_file, _, bbox, phrase, attri = self.images[idx]
        ## box format: to x1y1x2y2
        if not (self.dataset == 'referit' or self.dataset == 'flickr'):
            bbox = np.array(bbox, dtype=int)
            bbox[2], bbox[3] = bbox[0]+bbox[2], bbox[1]+bbox[3]
        else:
            bbox = np.array(bbox, dtype=int)

        # img_file = 'files/p12/p12423759/s53349935/b8c7a778-2f7f712d-5c598645-6aeebbb3-66ffbcc7.jpg'  # Experiments @fixImage
        if self.args.ablation == 'onlyText':
            img_file = 'files/p12/p12423759/s53349935/b8c7a778-2f7f712d-5c598645-6aeebbb3-66ffbcc7.jpg'

        img_path = osp.join(self.im_dir, img_file)
        info['img_path'] = img_path
        img = Image.open(img_path).convert("RGB")

        # img = cv2.imread(img_path)
        # ## duplicate channel if gray image
        # if img.shape[-1] > 1:
        #     img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        # else:
        #     img = np.stack([img] * 3)

        bbox = torch.tensor(bbox)
        bbox = bbox.float()
        # info['phrase_marker'] = phrase_marker
        return img, phrase, bbox, info

    def tokenize_phrase(self, phrase):
        return self.corpus.tokenize(phrase, self.query_len)

    def untokenize_word_vector(self, words):
        return self.corpus.dictionary[words]

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img, phrase, bbox, info = self.pull_item(idx)
        # phrase = phrase.decode("utf-8").encode().lower()
        phrase = phrase.lower()
        if hasattr(self.args, 'CATextPoolType') and self.args.CATextPoolType == 'marker':
            # TODO
            phrase = info['phrase_marker']
        info['phrase_record'] = phrase  # for visualization; info: img_path, phrase_record, anno_id, category_id
        input_dict = {'img': img, 'box': bbox, 'text': phrase}

        if self.args.model_name == 'TransVG_ca' and self.split == 'train':
            NegBBoxs = sampleNegBBox(bbox, self.args.CAsampleType, self.args.CAsampleNum)  # negative bbox

            input_dict = {'img': img, 'box': bbox, 'text': phrase, 'NegBBoxs': NegBBoxs}
        if self.args.model_name == 'TransVG_gn' and self.split == 'train':
            json_name = os.path.splitext(os.path.basename(info['img_path']))[0]+'_SceneGraph.json'
            json_name = os.path.join(self.args.GNpath, json_name)
            # parse the JSON to obtain all anatomy-level classification labels
            gnLabel = getCLSLabel(json_name, bbox)
            info['gnLabel'] = gnLabel

        input_dict = self.transform(input_dict)
        img = input_dict['img']
        bbox = input_dict['box']
        phrase = input_dict['text']
        img_mask = input_dict['mask']
        if self.args.model_name == 'TransVG_ca' and self.split == 'train':
            info['NegBBoxs'] = [np.array(negBBox, dtype=np.float32) for negBBox in input_dict['NegBBoxs']]

        if self.lstm:
            phrase = self.tokenize_phrase(phrase)
            word_id = phrase
            word_mask = np.array(word_id>0, dtype=int)
        else:
            ## encode phrase to bert input
            examples = read_examples(phrase, idx)
            if hasattr(self.args, 'CATextPoolType') and self.args.CATextPoolType == 'marker':
                use_marker = 'yes'
            else:
                use_marker = None
            features = convert_examples_to_features(
                examples=examples, seq_length=self.query_len, tokenizer=self.tokenizer, usemarker=use_marker)
            word_id = features[0].input_ids
            word_mask = features[0].input_mask
            if self.args.ablation == 'onlyImage':
                word_mask = [0] * word_mask.__len__()  # experiments @2
            # if self.args.ablation == 'onlyText':
            #     img_mask = np.ones_like(np.array(img_mask))

        if self.testmode:
            return img, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \
                np.array(bbox, dtype=np.float32), np.array(ratio, dtype=np.float32), \
                np.array(dw, dtype=np.float32), np.array(dh, dtype=np.float32), self.images[idx][0]
        else:
            return img, np.array(img_mask), np.array(word_id, dtype=int), np.array(word_mask, dtype=int), np.array(bbox, dtype=np.float32), info
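To make the tokenization path above concrete, here is a minimal usage sketch of read_examples and convert_examples_to_features, mirroring how demo.py calls them (it assumes the med_rpg directory, including its utils package, is on sys.path so that data_loader imports cleanly):

import data_loader
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Wrap the phrase in an InputExample, then tokenize, add [CLS]/[SEP], and zero-pad to seq_length.
examples = data_loader.read_examples('small left apical pneumothorax', 1)
features = data_loader.convert_examples_to_features(
    examples=examples, seq_length=20, tokenizer=tokenizer, usemarker=None)

assert len(features[0].input_ids) == 20 and len(features[0].input_mask) == 20
print(features[0].tokens)  # starts with '[CLS]' and ends with '[SEP]'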
med_rpg/demo.py
ADDED
@@ -0,0 +1,222 @@
import argparse
import numpy as np
import torch

# import datasets
import utils.misc as misc
from utils.box_utils import xywh2xyxy
from utils.visual_bbox import visualBBox
from models import build_model
import transforms as T
import PIL.Image as Image
import data_loader
from transformers import AutoTokenizer


def get_args_parser():
    parser = argparse.ArgumentParser('Set transformer detector', add_help=False)

    # Input config
    # parser.add_argument('--image', type=str, default='xxx', help="input X-ray image.")
    # parser.add_argument('--phrase', type=str, default='xxx', help="input phrase.")
    # parser.add_argument('--bbox', type=str, default='xxx', help="alternative, if you want to show ground-truth bbox")

    # fool
    parser.add_argument('--lr', default=1e-4, type=float)
    parser.add_argument('--lr_bert', default=0., type=float)
    parser.add_argument('--lr_visu_cnn', default=0., type=float)
    parser.add_argument('--lr_visu_tra', default=1e-5, type=float)
    parser.add_argument('--batch_size', default=32, type=int)
    parser.add_argument('--weight_decay', default=1e-4, type=float)
    parser.add_argument('--epochs', default=100, type=int)
    parser.add_argument('--lr_power', default=0.9, type=float, help='lr poly power')
    parser.add_argument('--clip_max_norm', default=0., type=float,
                        help='gradient clipping max norm')
    parser.add_argument('--eval', dest='eval', default=False, action='store_true', help='if evaluation only')
    parser.add_argument('--optimizer', default='rmsprop', type=str)
    parser.add_argument('--lr_scheduler', default='poly', type=str)
    parser.add_argument('--lr_drop', default=80, type=int)
    # Model parameters
    parser.add_argument('--model_name', type=str, default='TransVG_ca',
                        help="Name of model to be exploited.")


    # Transformers in two branches
    parser.add_argument('--bert_enc_num', default=12, type=int)
    parser.add_argument('--detr_enc_num', default=6, type=int)

    # DETR parameters
    # * Backbone
    parser.add_argument('--backbone', default='resnet50', type=str,
                        help="Name of the convolutional backbone to use")
    parser.add_argument('--dilation', action='store_true',
                        help="If true, we replace stride with dilation in the last convolutional block (DC5)")
    parser.add_argument('--position_embedding', default='sine', type=str, choices=('sine', 'learned'), help="Type of positional embedding to use on top of the image features")
    # * Transformer
    parser.add_argument('--enc_layers', default=6, type=int,
                        help="Number of encoding layers in the transformer")
    parser.add_argument('--dec_layers', default=0, type=int,
                        help="Number of decoding layers in the transformer")
    parser.add_argument('--dim_feedforward', default=2048, type=int,
                        help="Intermediate size of the feedforward layers in the transformer blocks")
    parser.add_argument('--hidden_dim', default=256, type=int,
                        help="Size of the embeddings (dimension of the transformer)")
    parser.add_argument('--dropout', default=0.1, type=float,
                        help="Dropout applied in the transformer")
    parser.add_argument('--nheads', default=8, type=int,
                        help="Number of attention heads inside the transformer's attentions")
    parser.add_argument('--num_queries', default=100, type=int,
                        help="Number of query slots")
    parser.add_argument('--pre_norm', action='store_true')

    parser.add_argument('--imsize', default=640, type=int, help='image size')
    parser.add_argument('--emb_size', default=512, type=int,
                        help='fusion module embedding dimensions')
    # Vision-Language Transformer
    parser.add_argument('--use_vl_type_embed', action='store_true',
                        help="If true, use vl_type embedding")
    parser.add_argument('--vl_dropout', default=0.1, type=float,
                        help="Dropout applied in the vision-language transformer")
    parser.add_argument('--vl_nheads', default=8, type=int,
                        help="Number of attention heads inside the vision-language transformer's attentions")
    parser.add_argument('--vl_hidden_dim', default=256, type=int,
                        help='Size of the embeddings (dimension of the vision-language transformer)')
    parser.add_argument('--vl_dim_feedforward', default=2048, type=int,
                        help="Intermediate size of the feedforward layers in the vision-language transformer blocks")
    parser.add_argument('--vl_enc_layers', default=6, type=int,
                        help='Number of encoders in the vision-language transformer')

    # Dataset parameters
    # parser.add_argument('--data_root', type=str, default='./ln_data/',
    #                     help='path to ReferIt splits data folder')
    # parser.add_argument('--split_root', type=str, default='data',
    #                     help='location of pre-parsed dataset info')
    parser.add_argument('--dataset', default='MS_CXR', type=str,
                        help='referit/flickr/unc/unc+/gref')
    parser.add_argument('--max_query_len', default=20, type=int,
                        help='maximum time steps (lang length) per batch')

    # dataset parameters
    parser.add_argument('--output_dir', default='outputs',
                        help='path where to save, empty for no saving')
    parser.add_argument('--device', default='cuda',
                        help='device to use for training / testing')
    # parser.add_argument('--seed', default=13, type=int)
    # parser.add_argument('--resume', default='', help='resume from checkpoint')
    parser.add_argument('--detr_model', default='./saved_models/detr-r50.pth', type=str, help='detr model')
    parser.add_argument('--bert_model', default='bert-base-uncased', type=str, help='bert model')
    # parser.add_argument('--light', dest='light', default=False, action='store_true', help='if use smaller model')
    # parser.add_argument('--start_epoch', default=0, type=int, metavar='N',
    #                     help='start epoch')
    # parser.add_argument('--num_workers', default=2, type=int)

    # distributed training parameters
    # parser.add_argument('--world_size', default=1, type=int,
    #                     help='number of distributed processes')
    # parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training')

    # evaluation options
    # parser.add_argument('--eval_set', default='test', type=str)
    parser.add_argument('--eval_model', default='checkpoint/best_miou_checkpoint.pth', type=str)

    # visualization options
    # parser.add_argument('--visualization', action='store_true',
    #                     help="If true, visualize the bbox")
    # parser.add_argument('--visual_MHA', action='store_true',
    #                     help="If true, visualize the attention maps")

    return parser

def make_transforms(imsize):
    return T.Compose([
        T.RandomResize([imsize]),
        T.ToTensor(),
        T.NormalizeAndPad(size=imsize),
    ])

def main(args):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    image_size = 640  # hyper parameter

    ## build data
    # case1
    img_path = "data/649af982-e3af4e3a-75013d30-cdc71514-a34738fd.jpg"
    phrase = 'Small left apical pneumothorax'
    bbox = [332, 28, 141, 48]  # xywh
    # # case2
    # img_path = 'files/p10/p10977201/s59062881/00363400-cee06fa7-8c2ca1f7-2678a170-b3a62a6e.jpg'
    # phrase = 'small apical pneumothorax'
    # bbox = [161, 134, 111, 37]
    # # case3
    # img_path = 'files/p18/p18426683/s59612243/95423e8e-45dff550-563d3eba-b8bc94be-a87f5a1d.jpg'
    # phrase = 'cardiac silhouette enlarged'
    # bbox = [196, 312, 371, 231]
    # # case4
    # img_path = 'files/p10/p10048451/s53489305/4b7f7a4c-18c39245-53724c25-06878595-7e41bb94.jpg'
    # phrase = 'Focal opacity in the lingular lobe'
    # bbox = [467, 373, 131, 189]
    # # case5
    # img_path = 'files/p19/p19757720/s59572378/13255e1f-91b7b172-02baaeee-340ec493-0e531681.jpg'
    # phrase = 'multisegmental right upper lobe consolidation is present'
    # bbox = [9, 86, 232, 278]
    # # case6
    # img_path = 'files/p10/p10469621/s56786891/04e10148-c36f7afb-d0aaf964-152d8a5d-a02ab550.jpg'
    # phrase = 'right middle lobe opacity, suspicious for pneumonia in the proper clinical setting'
    # bbox = [108, 405, 162, 83]
    # # case7
    # img_path = 'files/p10/p10670818/s50191454/1176839d-cf4f677f-d597a1ef-548bc32a-c05429f3.jpg'
    # phrase = 'Newly appeared lingular opacity'
    # bbox = [392, 297, 141, 151]

    bbox = bbox[:2] + [bbox[0]+bbox[2], bbox[1]+bbox[3]]  # xywh2xyxy

    ## encode phrase to bert input
    examples = data_loader.read_examples(phrase, 1)
    tokenizer = AutoTokenizer.from_pretrained(args.bert_model, do_lower_case=True)
    features = data_loader.convert_examples_to_features(
        examples=examples, seq_length=args.max_query_len, tokenizer=tokenizer, usemarker=None)
    word_id = torch.tensor(features[0].input_ids)
    word_mask = torch.tensor(features[0].input_mask)

    ## read and transform image
    input_dict = dict()
    img = Image.open(img_path).convert("RGB")
    input_dict['img'] = img
    fake_bbox = torch.tensor(np.array([0,0,0,0], dtype=int)).float()  # to avoid a bug in the transform
    input_dict['box'] = fake_bbox  # to avoid a bug in the transform
    input_dict['text'] = phrase
    transform = make_transforms(imsize=image_size)
    input_dict = transform(input_dict)
    img = input_dict['img']
    img_mask = input_dict['mask']
    # if bbox is not None:
    #     bbox = input_dict['box']

    img_data = misc.NestedTensor(img.unsqueeze(0), img_mask.unsqueeze(0))
    text_data = misc.NestedTensor(word_id.unsqueeze(0), word_mask.unsqueeze(0))

    ## build model
    model = build_model(args)
    model.to(device)
    checkpoint = torch.load(args.eval_model, map_location='cpu')
    model.load_state_dict(checkpoint['model'])

    ## model infer
    img_data = img_data.to(device)
    text_data = text_data.to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(img_data, text_data)
    pred_box = outputs['pred_box']
    pred_box = xywh2xyxy(pred_box.detach().cpu())*image_size
    pred_box = pred_box.numpy()[0]
    pred_box = [round(pred_box[0]), round(pred_box[1]), round(pred_box[2]), round(pred_box[3])]
    visualBBox(img_path, pred_box, bbox)



if __name__ == '__main__':
    parser = argparse.ArgumentParser('TransVG evaluation script', parents=[get_args_parser()])
    args = parser.parse_args()
    main(args)
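Usage note (a sketch, not part of the commit): run the script from inside the med_rpg directory, e.g. python demo.py --eval_model checkpoint/best_miou_checkpoint.pth, so that the relative data/ image path and the checkpoint default above resolve; it grounds the hard-coded case-1 phrase and visualizes the predicted and ground-truth boxes via visualBBox.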
med_rpg/med_rpg.py
ADDED
@@ -0,0 +1,268 @@
1 |
+
import argparse
|
2 |
+
import numpy as np
|
3 |
+
import torch
|
4 |
+
|
5 |
+
# import datasets
|
6 |
+
import utils.misc as misc
|
7 |
+
from utils.box_utils import xywh2xyxy
|
8 |
+
from utils.visual_bbox import visualBBox
|
9 |
+
# from models import build_model
|
10 |
+
import transforms as T
|
11 |
+
import PIL.Image as Image
|
12 |
+
import data_loader
|
13 |
+
from transformers import AutoTokenizer
|
14 |
+
|
15 |
+
|
16 |
+
def get_args_parser():
|
17 |
+
parser = argparse.ArgumentParser('Set transformer detector', add_help=False)
|
18 |
+
|
19 |
+
# Input config
|
20 |
+
# parser.add_argument('--image', type=str, default='xxx', help="input X-ray image.")
|
21 |
+
# parser.add_argument('--phrase', type=str, default='xxx', help="input phrase.")
|
22 |
+
# parser.add_argument('--bbox', type=str, default='xxx', help="alternative, if you want to show ground-truth bbox")
|
23 |
+
|
24 |
+
# fool
|
25 |
+
parser.add_argument('--lr', default=1e-4, type=float)
|
26 |
+
parser.add_argument('--lr_bert', default=0., type=float)
|
27 |
+
parser.add_argument('--lr_visu_cnn', default=0., type=float)
|
28 |
+
parser.add_argument('--lr_visu_tra', default=1e-5, type=float)
|
29 |
+
parser.add_argument('--batch_size', default=32, type=int)
|
30 |
+
parser.add_argument('--weight_decay', default=1e-4, type=float)
|
31 |
+
parser.add_argument('--epochs', default=100, type=int)
|
32 |
+
parser.add_argument('--lr_power', default=0.9, type=float, help='lr poly power')
|
33 |
+
parser.add_argument('--clip_max_norm', default=0., type=float,
|
34 |
+
help='gradient clipping max norm')
|
35 |
+
parser.add_argument('--eval', dest='eval', default=False, action='store_true', help='if evaluation only')
|
36 |
+
parser.add_argument('--optimizer', default='rmsprop', type=str)
|
37 |
+
parser.add_argument('--lr_scheduler', default='poly', type=str)
|
38 |
+
parser.add_argument('--lr_drop', default=80, type=int)
|
39 |
+
# Model parameters
|
40 |
+
parser.add_argument('--model_name', type=str, default='TransVG_ca',
|
41 |
+
help="Name of model to be exploited.")
|
42 |
+
|
43 |
+
|
44 |
+
# Transformers in two branches
|
45 |
+
parser.add_argument('--bert_enc_num', default=12, type=int)
|
46 |
+
parser.add_argument('--detr_enc_num', default=6, type=int)
|
47 |
+
|
48 |
+
# DETR parameters
|
49 |
+
# * Backbone
|
50 |
+
parser.add_argument('--backbone', default='resnet50', type=str,
|
51 |
+
help="Name of the convolutional backbone to use")
|
52 |
+
parser.add_argument('--dilation', action='store_true',
|
53 |
+
help="If true, we replace stride with dilation in the last convolutional block (DC5)")
|
54 |
+
parser.add_argument('--position_embedding', default='sine', type=str, choices=('sine', 'learned'), help="Type of positional embedding to use on top of the image features")
|
55 |
+
# * Transformer
|
56 |
+
parser.add_argument('--enc_layers', default=6, type=int,
|
57 |
+
help="Number of encoding layers in the transformer")
|
58 |
+
parser.add_argument('--dec_layers', default=0, type=int,
|
59 |
+
help="Number of decoding layers in the transformer")
|
60 |
+
parser.add_argument('--dim_feedforward', default=2048, type=int,
|
61 |
+
help="Intermediate size of the feedforward layers in the transformer blocks")
|
62 |
+
parser.add_argument('--hidden_dim', default=256, type=int,
|
63 |
+
help="Size of the embeddings (dimension of the transformer)")
|
64 |
+
parser.add_argument('--dropout', default=0.1, type=float,
|
65 |
+
help="Dropout applied in the transformer")
|
66 |
+
parser.add_argument('--nheads', default=8, type=int,
|
67 |
+
help="Number of attention heads inside the transformer's attentions")
|
68 |
+
parser.add_argument('--num_queries', default=100, type=int,
|
69 |
+
help="Number of query slots")
|
70 |
+
parser.add_argument('--pre_norm', action='store_true')
|
71 |
+
|
72 |
+
parser.add_argument('--imsize', default=640, type=int, help='image size')
|
73 |
+
parser.add_argument('--emb_size', default=512, type=int,
|
74 |
+
help='fusion module embedding dimensions')
|
75 |
+
# Vision-Language Transformer
|
76 |
+
parser.add_argument('--use_vl_type_embed', action='store_true',
|
77 |
+
help="If true, use vl_type embedding")
|
78 |
+
parser.add_argument('--vl_dropout', default=0.1, type=float,
|
79 |
+
help="Dropout applied in the vision-language transformer")
|
80 |
+
parser.add_argument('--vl_nheads', default=8, type=int,
|
81 |
+
help="Number of attention heads inside the vision-language transformer's attentions")
|
82 |
+
parser.add_argument('--vl_hidden_dim', default=256, type=int,
|
83 |
+
help='Size of the embeddings (dimension of the vision-language transformer)')
|
84 |
+
parser.add_argument('--vl_dim_feedforward', default=2048, type=int,
|
85 |
+
help="Intermediate size of the feedforward layers in the vision-language transformer blocks")
|
86 |
+
parser.add_argument('--vl_enc_layers', default=6, type=int,
|
87 |
+
help='Number of encoders in the vision-language transformer')
|
88 |
+
|
89 |
+
# Dataset parameters
|
90 |
+
# parser.add_argument('--data_root', type=str, default='./ln_data/',
|
91 |
+
# help='path to ReferIt splits data folder')
|
92 |
+
# parser.add_argument('--split_root', type=str, default='data',
|
93 |
+
# help='location of pre-parsed dataset info')
|
94 |
+
parser.add_argument('--dataset', default='MS_CXR', type=str,
|
95 |
+
help='referit/flickr/unc/unc+/gref')
|
96 |
+
parser.add_argument('--max_query_len', default=20, type=int,
|
97 |
+
help='maximum time steps (lang length) per batch')
|
98 |
+
|
99 |
+
# dataset parameters
|
100 |
+
parser.add_argument('--output_dir', default='outputs',
|
101 |
+
help='path where to save, empty for no saving')
|
102 |
+
parser.add_argument('--device', default='cuda',
|
103 |
+
help='device to use for training / testing')
|
104 |
+
# parser.add_argument('--seed', default=13, type=int)
|
105 |
+
# parser.add_argument('--resume', default='', help='resume from checkpoint')
|
106 |
+
parser.add_argument('--detr_model', default='./saved_models/detr-r50.pth', type=str, help='detr model')
|
107 |
+
parser.add_argument('--bert_model', default='bert-base-uncased', type=str, help='bert model')
|
108 |
+
# parser.add_argument('--light', dest='light', default=False, action='store_true', help='if use smaller model')
|
109 |
+
# parser.add_argument('--start_epoch', default=0, type=int, metavar='N',
|
110 |
+
# help='start epoch')
|
111 |
+
# parser.add_argument('--num_workers', default=2, type=int)
|
112 |
+
|
113 |
+
# distributed training parameters
|
114 |
+
# parser.add_argument('--world_size', default=1, type=int,
|
115 |
+
# help='number of distributed processes')
|
116 |
+
# parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training')
|
117 |
+
|
118 |
+
# evalutaion options
|
119 |
+
# parser.add_argument('--eval_set', default='test', type=str)
|
120 |
+
parser.add_argument('--eval_model', default='med_rpg/checkpoint/best_miou_checkpoint.pth', type=str)
|
121 |
+
|
122 |
+
# visualization options
|
123 |
+
# parser.add_argument('--visualization', action='store_true',
|
124 |
+
# help="If true, visual the bbox")
|
125 |
+
# parser.add_argument('--visual_MHA', action='store_true',
|
126 |
+
# help="If true, visual the attention maps")
|
127 |
+
|
128 |
+
return parser
|
129 |
+
|
130 |
+
def make_transforms(imsize):
|
131 |
+
return T.Compose([
|
132 |
+
T.RandomResize([imsize]),
|
133 |
+
T.ToTensor(),
|
134 |
+
T.NormalizeAndPad(size=imsize),
|
135 |
+
])
|
136 |
+
|
137 |
+
def medical_phrase_grounding(model, tokenizer, orig_img, text, bbox = None):
|
138 |
+
image_size = 640 # hyper parameters
|
139 |
+
max_query_len = 20
|
140 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
141 |
+
# device = torch.device("mps")
|
142 |
+
if bbox is not None:
|
143 |
+
# bbox = bbox[:2] + [bbox[0]+bbox[2], bbox[1]+bbox[3]] # xywh2xyxy
|
144 |
+
# bbox[1:] = bbox[1:3] + [bbox[1]+bbox[3], bbox[2]+bbox[4]] # xywh2xyxy
|
145 |
+
# bbox[2] = bbox[0] + bbox[2]
|
146 |
+
# bbox[3] = bbox[1] + bbox[3] # xywh2xyxy
|
147 |
+
|
148 |
+
## encode phrase to bert input
|
149 |
+
examples = data_loader.read_examples(text, 1)
|
150 |
+
features = data_loader.convert_examples_to_features(
|
151 |
+
examples=examples, seq_length=max_query_len, tokenizer=tokenizer, usemarker=None)
|
152 |
+
word_id = torch.tensor(features[0].input_ids) #
|
153 |
+
word_mask = torch.tensor(features[0].input_mask) #
|
154 |
+
|
155 |
+
## read and transform image
|
156 |
+
input_dict = dict()
|
157 |
+
input_dict['img'] = orig_img
|
158 |
+
fake_bbox = torch.tensor(np.array([0,0,0,0], dtype=int)).float() #for avoid bug
|
159 |
+
input_dict['box'] = fake_bbox #for avoid bug
|
160 |
+
input_dict['text'] = text
|
161 |
+
transform = make_transforms(imsize=image_size)
|
162 |
+
input_dict = transform(input_dict)
|
163 |
+
img = input_dict['img'] #
|
164 |
+
img_mask = input_dict['mask'] #
|
165 |
+
|
166 |
+
img_data = misc.NestedTensor(img.unsqueeze(0), img_mask.unsqueeze(0))
|
167 |
+
text_data = misc.NestedTensor(word_id.unsqueeze(0), word_mask.unsqueeze(0))
|
168 |
+
|
169 |
+
## model infer
|
170 |
+
    img_data = img_data.to(device)
    text_data = text_data.to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(img_data, text_data)
    pred_box = outputs['pred_box']
    pred_box = xywh2xyxy(pred_box.detach().cpu()) * image_size
    pred_box = pred_box.numpy()[0]
    pred_box = [round(pred_box[0]), round(pred_box[1]), round(pred_box[2]), round(pred_box[3])]
    output_img = visualBBox(orig_img, pred_box, bbox)
    return output_img


def main(args):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # device = torch.device("mps")
    image_size = 640  # hyper-parameter: input resolution fed to the model

    ## build data
    # case1
    img_path = "images/649af982-e3af4e3a-75013d30-cdc71514-a34738fd.jpg"
    phrase = 'Small left apical pneumothorax'
    bbox = [332, 28, 141, 48]  # xywh
    # # case2
    # img_path = 'files/p10/p10977201/s59062881/00363400-cee06fa7-8c2ca1f7-2678a170-b3a62a6e.jpg'
    # phrase = 'small apical pneumothorax'
    # bbox = [161, 134, 111, 37]
    # # case3
    # img_path = 'files/p18/p18426683/s59612243/95423e8e-45dff550-563d3eba-b8bc94be-a87f5a1d.jpg'
    # phrase = 'cardiac silhouette enlarged'
    # bbox = [196, 312, 371, 231]
    # # case4
    # img_path = 'files/p10/p10048451/s53489305/4b7f7a4c-18c39245-53724c25-06878595-7e41bb94.jpg'
    # phrase = 'Focal opacity in the lingular lobe'
    # bbox = [467, 373, 131, 189]
    # # case5
    # img_path = 'files/p19/p19757720/s59572378/13255e1f-91b7b172-02baaeee-340ec493-0e531681.jpg'
    # phrase = 'multisegmental right upper lobe consolidation is present'
    # bbox = [9, 86, 232, 278]
    # # case6
    # img_path = 'files/p10/p10469621/s56786891/04e10148-c36f7afb-d0aaf964-152d8a5d-a02ab550.jpg'
    # phrase = 'right middle lobe opacity, suspicious for pneumonia in the proper clinical setting'
    # bbox = [108, 405, 162, 83]
    # # case7
    # img_path = 'files/p10/p10670818/s50191454/1176839d-cf4f677f-d597a1ef-548bc32a-c05429f3.jpg'
    # phrase = 'Newly appeared lingular opacity'
    # bbox = [392, 297, 141, 151]

    bbox = bbox[:2] + [bbox[0] + bbox[2], bbox[1] + bbox[3]]  # xywh2xyxy

    ## encode phrase to bert input
    examples = data_loader.read_examples(phrase, 1)
    tokenizer = AutoTokenizer.from_pretrained(args.bert_model, do_lower_case=True)
    features = data_loader.convert_examples_to_features(
        examples=examples, seq_length=args.max_query_len, tokenizer=tokenizer, usemarker=None)
    word_id = torch.tensor(features[0].input_ids)      # padded token ids
    word_mask = torch.tensor(features[0].input_mask)   # attention mask for the padded tokens

    ## read and transform image
    input_dict = dict()
    img = Image.open(img_path).convert("RGB")
    input_dict['img'] = img
    fake_bbox = torch.tensor(np.array([0, 0, 0, 0], dtype=int)).float()  # placeholder box: the transform pipeline expects a 'box' entry
    input_dict['box'] = fake_bbox
    input_dict['text'] = phrase
    transform = make_transforms(imsize=image_size)
    input_dict = transform(input_dict)
    img = input_dict['img']        # transformed image tensor
    img_mask = input_dict['mask']  # padding mask produced by the transform
    # if bbox is not None:
    #     bbox = input_dict['box']

    img_data = misc.NestedTensor(img.unsqueeze(0), img_mask.unsqueeze(0))
    text_data = misc.NestedTensor(word_id.unsqueeze(0), word_mask.unsqueeze(0))

    ## build model
    model = build_model(args)
    model.to(device)
    checkpoint = torch.load(args.eval_model, map_location='cpu')
    model.load_state_dict(checkpoint['model'])

    ## model inference
    img_data = img_data.to(device)
    text_data = text_data.to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(img_data, text_data)
    pred_box = outputs['pred_box']
    pred_box = xywh2xyxy(pred_box.detach().cpu()) * image_size
    pred_box = pred_box.numpy()[0]
    pred_box = [round(pred_box[0]), round(pred_box[1]), round(pred_box[2]), round(pred_box[3])]
    visualBBox(img_path, pred_box, bbox)


if __name__ == '__main__':
    parser = argparse.ArgumentParser('TransVG evaluation script', parents=[get_args_parser()])
    args = parser.parse_args()
    main(args)
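For orientation only: the drawing helper `visualBBox` used above is defined earlier in app.py and is not shown in this hunk, so the following is a hedged, standalone sketch of what such a helper typically does (the function name `draw_boxes`, the colours, and the PIL-based drawing are my own assumptions, and scaling the prediction back to the original image resolution is omitted).

# Hedged sketch: a minimal stand-in for the box drawing used above.
# Assumption: boxes are [x1, y1, x2, y2] in pixel coordinates of the image being drawn on.
from PIL import Image, ImageDraw

def draw_boxes(image_path, pred_box, gt_box=None, out_path="output.jpg"):
    """Draw the predicted box (red) and, if given, the ground-truth box (green)."""
    img = Image.open(image_path).convert("RGB")
    draw = ImageDraw.Draw(img)
    draw.rectangle(pred_box, outline=(255, 0, 0), width=3)    # prediction
    if gt_box is not None:
        draw.rectangle(gt_box, outline=(0, 255, 0), width=3)  # ground truth
    img.save(out_path)
    return img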
med_rpg/models/MHA.py
ADDED
@@ -0,0 +1,467 @@
import torch
from torch import Tensor
from torch.nn.init import xavier_uniform_
from torch.nn.init import constant_
from torch.nn.init import xavier_normal_
from torch.nn.parameter import Parameter
from typing import Tuple, Optional
from torch.nn.modules.module import Module
from torch.nn.modules.linear import NonDynamicallyQuantizableLinear as _LinearWithBias
from torch.nn.functional import linear, pad, softmax, dropout
from torch.overrides import has_torch_function, handle_torch_function

import warnings
import math

# import torch
# from torch._C import _infer_size, _add_docstr
# from . import _reduction as _Reduction
# from .modules import utils
# from .modules.utils import _single, _pair, _triple, _list_with_default
# from . import grad  # noqa: F401
# from torch import _VF
# from .._jit_internal import boolean_dispatch, List, Optional, _overload, Tuple
# from ..overrides import has_torch_function, handle_torch_function


class MultiheadAttention(Module):
    r"""Allows the model to jointly attend to information
    from different representation subspaces.
    See reference: Attention Is All You Need

    .. math::
        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
        \text{where } head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)

    Args:
        embed_dim: total dimension of the model.
        num_heads: parallel attention heads.
        dropout: a Dropout layer on attn_output_weights. Default: 0.0.
        bias: add bias as module parameter. Default: True.
        add_bias_kv: add bias to the key and value sequences at dim=0.
        add_zero_attn: add a new batch of zeros to the key and
                       value sequences at dim=1.
        kdim: total number of features in key. Default: None.
        vdim: total number of features in value. Default: None.

        Note: if kdim and vdim are None, they will be set to embed_dim such that
        query, key, and value have the same number of features.

    Examples::

        >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
        >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
    """
    bias_k: Optional[torch.Tensor]
    bias_v: Optional[torch.Tensor]

    def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None):
        super(MultiheadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"

        if self._qkv_same_embed_dim is False:
            self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
            self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim))
            self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim))
            self.register_parameter('in_proj_weight', None)
        else:
            self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))
            self.register_parameter('q_proj_weight', None)
            self.register_parameter('k_proj_weight', None)
            self.register_parameter('v_proj_weight', None)

        if bias:
            self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
        else:
            self.register_parameter('in_proj_bias', None)
        self.out_proj = _LinearWithBias(embed_dim, embed_dim)

        if add_bias_kv:
            self.bias_k = Parameter(torch.empty(1, 1, embed_dim))
            self.bias_v = Parameter(torch.empty(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        self._reset_parameters()

    def _reset_parameters(self):
        if self._qkv_same_embed_dim:
            xavier_uniform_(self.in_proj_weight)
        else:
            xavier_uniform_(self.q_proj_weight)
            xavier_uniform_(self.k_proj_weight)
            xavier_uniform_(self.v_proj_weight)

        if self.in_proj_bias is not None:
            constant_(self.in_proj_bias, 0.)
            constant_(self.out_proj.bias, 0.)
        if self.bias_k is not None:
            xavier_normal_(self.bias_k)
        if self.bias_v is not None:
            xavier_normal_(self.bias_v)

    def __setstate__(self, state):
        # Support loading old MultiheadAttention checkpoints generated by v1.1.0
        if '_qkv_same_embed_dim' not in state:
            state['_qkv_same_embed_dim'] = True

        super(MultiheadAttention, self).__setstate__(state)

    def forward(self, query, key, value, key_padding_mask=None,
                need_weights=True, attn_mask=None):
        # type: (Tensor, Tensor, Tensor, Optional[Tensor], bool, Optional[Tensor]) -> Tuple[Tensor, Optional[Tensor]]
        r"""
    Args:
        query, key, value: map a query and a set of key-value pairs to an output.
            See "Attention Is All You Need" for more details.
        key_padding_mask: if provided, specified padding elements in the key will
            be ignored by the attention. When given a binary mask and a value is True,
            the corresponding value on the attention layer will be ignored. When given
            a byte mask and a value is non-zero, the corresponding value on the attention
            layer will be ignored.
        need_weights: output attn_output_weights.
        attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcast for all
            the batches while a 3D mask allows a different mask for the entries of each batch.

    Shape:
        - Inputs:
        - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
          the embedding dimension.
        - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
          the embedding dimension.
        - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
          the embedding dimension.
        - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
          If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions
          will be unchanged. If a BoolTensor is provided, the positions with the value of ``True``
          will be ignored while the positions with the value of ``False`` will be unchanged.
        - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
          3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
          S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
          positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
          while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
          are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
          is provided, it will be added to the attention weight.

        - Outputs:
        - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
          E is the embedding dimension.
        - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
          L is the target sequence length, S is the source sequence length.
        """
        if not self._qkv_same_embed_dim:
            return multi_head_attention_forward(
                query, key, value, self.embed_dim, self.num_heads,
                self.in_proj_weight, self.in_proj_bias,
                self.bias_k, self.bias_v, self.add_zero_attn,
                self.dropout, self.out_proj.weight, self.out_proj.bias,
                training=self.training,
                key_padding_mask=key_padding_mask, need_weights=need_weights,
                attn_mask=attn_mask, use_separate_proj_weight=True,
                q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
                v_proj_weight=self.v_proj_weight)
        else:
            return multi_head_attention_forward(
                query, key, value, self.embed_dim, self.num_heads,
                self.in_proj_weight, self.in_proj_bias,
                self.bias_k, self.bias_v, self.add_zero_attn,
                self.dropout, self.out_proj.weight, self.out_proj.bias,
                training=self.training,
                key_padding_mask=key_padding_mask, need_weights=need_weights,
                attn_mask=attn_mask)


def multi_head_attention_forward(query: Tensor,
                                 key: Tensor,
                                 value: Tensor,
                                 embed_dim_to_check: int,
                                 num_heads: int,
                                 in_proj_weight: Tensor,
                                 in_proj_bias: Tensor,
                                 bias_k: Optional[Tensor],
                                 bias_v: Optional[Tensor],
                                 add_zero_attn: bool,
                                 dropout_p: float,
                                 out_proj_weight: Tensor,
                                 out_proj_bias: Tensor,
                                 training: bool = True,
                                 key_padding_mask: Optional[Tensor] = None,
                                 need_weights: bool = True,
                                 attn_mask: Optional[Tensor] = None,
                                 use_separate_proj_weight: bool = False,
                                 q_proj_weight: Optional[Tensor] = None,
                                 k_proj_weight: Optional[Tensor] = None,
                                 v_proj_weight: Optional[Tensor] = None,
                                 static_k: Optional[Tensor] = None,
                                 static_v: Optional[Tensor] = None
                                 ) -> Tuple[Tensor, Optional[Tensor]]:
    r"""
    Args:
        query, key, value: map a query and a set of key-value pairs to an output.
            See "Attention Is All You Need" for more details.
        embed_dim_to_check: total dimension of the model.
        num_heads: parallel attention heads.
        in_proj_weight, in_proj_bias: input projection weight and bias.
        bias_k, bias_v: bias of the key and value sequences to be added at dim=0.
        add_zero_attn: add a new batch of zeros to the key and
                       value sequences at dim=1.
        dropout_p: probability of an element to be zeroed.
        out_proj_weight, out_proj_bias: the output projection weight and bias.
        training: apply dropout if is ``True``.
        key_padding_mask: if provided, specified padding elements in the key will
            be ignored by the attention. This is a binary mask. When the value is True,
            the corresponding value on the attention layer will be filled with -inf.
        need_weights: output attn_output_weights.
        attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcast for all
            the batches while a 3D mask allows a different mask for the entries of each batch.
        use_separate_proj_weight: the function accepts the proj. weights for query, key,
            and value in different forms. If false, in_proj_weight will be used, which is
            a combination of q_proj_weight, k_proj_weight, v_proj_weight.
        q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias.
        static_k, static_v: static key and value used for attention operators.


    Shape:
        Inputs:
        - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
          the embedding dimension.
        - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
          the embedding dimension.
        - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
          the embedding dimension.
        - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
          If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions
          will be unchanged. If a BoolTensor is provided, the positions with the value of ``True``
          will be ignored while the positions with the value of ``False`` will be unchanged.
        - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
          3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
          S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
          positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
          while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
          are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
          is provided, it will be added to the attention weight.
        - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
          N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
        - static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
          N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.

        Outputs:
        - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
          E is the embedding dimension.
        - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
          L is the target sequence length, S is the source sequence length.
    """
    if not torch.jit.is_scripting():
        tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v,
                    out_proj_weight, out_proj_bias)
        if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(tens_ops):
            return handle_torch_function(
                multi_head_attention_forward, tens_ops, query, key, value,
                embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias,
                bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight,
                out_proj_bias, training=training, key_padding_mask=key_padding_mask,
                need_weights=need_weights, attn_mask=attn_mask,
                use_separate_proj_weight=use_separate_proj_weight,
                q_proj_weight=q_proj_weight, k_proj_weight=k_proj_weight,
                v_proj_weight=v_proj_weight, static_k=static_k, static_v=static_v)
    tgt_len, bsz, embed_dim = query.size()
    assert embed_dim == embed_dim_to_check
    # allow MHA to have different sizes for the feature dimension
    assert key.size(0) == value.size(0) and key.size(1) == value.size(1)

    head_dim = embed_dim // num_heads
    assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
    scaling = float(head_dim) ** -0.5

    if not use_separate_proj_weight:
        if torch.equal(query, key) and torch.equal(key, value):
            # self-attention
            q, k, v = linear(query, in_proj_weight, in_proj_bias).chunk(3, dim=-1)

        elif torch.equal(key, value):
            # encoder-decoder attention
            # This is inline in_proj function with in_proj_weight and in_proj_bias
            _b = in_proj_bias
            _start = 0
            _end = embed_dim
            _w = in_proj_weight[_start:_end, :]
            if _b is not None:
                _b = _b[_start:_end]
            q = linear(query, _w, _b)

            if key is None:
                assert value is None
                k = None
                v = None
            else:
                # This is inline in_proj function with in_proj_weight and in_proj_bias
                _b = in_proj_bias
                _start = embed_dim
                _end = None
                _w = in_proj_weight[_start:, :]
                if _b is not None:
                    _b = _b[_start:]
                k, v = linear(key, _w, _b).chunk(2, dim=-1)

        else:
            # This is inline in_proj function with in_proj_weight and in_proj_bias
            _b = in_proj_bias
            _start = 0
            _end = embed_dim
            _w = in_proj_weight[_start:_end, :]
            if _b is not None:
                _b = _b[_start:_end]
            q = linear(query, _w, _b)

            # This is inline in_proj function with in_proj_weight and in_proj_bias
            _b = in_proj_bias
            _start = embed_dim
            _end = embed_dim * 2
            _w = in_proj_weight[_start:_end, :]
            if _b is not None:
                _b = _b[_start:_end]
            k = linear(key, _w, _b)

            # This is inline in_proj function with in_proj_weight and in_proj_bias
            _b = in_proj_bias
            _start = embed_dim * 2
            _end = None
            _w = in_proj_weight[_start:, :]
            if _b is not None:
                _b = _b[_start:]
            v = linear(value, _w, _b)
    else:
        q_proj_weight_non_opt = torch.jit._unwrap_optional(q_proj_weight)
        len1, len2 = q_proj_weight_non_opt.size()
        assert len1 == embed_dim and len2 == query.size(-1)

        k_proj_weight_non_opt = torch.jit._unwrap_optional(k_proj_weight)
        len1, len2 = k_proj_weight_non_opt.size()
        assert len1 == embed_dim and len2 == key.size(-1)

        v_proj_weight_non_opt = torch.jit._unwrap_optional(v_proj_weight)
        len1, len2 = v_proj_weight_non_opt.size()
        assert len1 == embed_dim and len2 == value.size(-1)

        if in_proj_bias is not None:
            q = linear(query, q_proj_weight_non_opt, in_proj_bias[0:embed_dim])
            k = linear(key, k_proj_weight_non_opt, in_proj_bias[embed_dim:(embed_dim * 2)])
            v = linear(value, v_proj_weight_non_opt, in_proj_bias[(embed_dim * 2):])
        else:
            q = linear(query, q_proj_weight_non_opt, in_proj_bias)
            k = linear(key, k_proj_weight_non_opt, in_proj_bias)
            v = linear(value, v_proj_weight_non_opt, in_proj_bias)
    q = q * scaling

    if attn_mask is not None:
        assert attn_mask.dtype == torch.float32 or attn_mask.dtype == torch.float64 or \
            attn_mask.dtype == torch.float16 or attn_mask.dtype == torch.uint8 or attn_mask.dtype == torch.bool, \
            'Only float, byte, and bool types are supported for attn_mask, not {}'.format(attn_mask.dtype)
        if attn_mask.dtype == torch.uint8:
            warnings.warn("Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
            attn_mask = attn_mask.to(torch.bool)

        if attn_mask.dim() == 2:
            attn_mask = attn_mask.unsqueeze(0)
            if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
                raise RuntimeError('The size of the 2D attn_mask is not correct.')
        elif attn_mask.dim() == 3:
            if list(attn_mask.size()) != [bsz * num_heads, query.size(0), key.size(0)]:
                raise RuntimeError('The size of the 3D attn_mask is not correct.')
        else:
            raise RuntimeError("attn_mask's dimension {} is not supported".format(attn_mask.dim()))
        # attn_mask's dim is 3 now.

    # convert ByteTensor key_padding_mask to bool
    if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
        warnings.warn("Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
        key_padding_mask = key_padding_mask.to(torch.bool)

    if bias_k is not None and bias_v is not None:
        if static_k is None and static_v is None:
            k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
            v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
            if attn_mask is not None:
                attn_mask = pad(attn_mask, (0, 1))
            if key_padding_mask is not None:
                key_padding_mask = pad(key_padding_mask, (0, 1))
        else:
            assert static_k is None, "bias cannot be added to static key."
            assert static_v is None, "bias cannot be added to static value."
    else:
        assert bias_k is None
        assert bias_v is None

    q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
    if k is not None:
        k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
    if v is not None:
        v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)

    if static_k is not None:
        assert static_k.size(0) == bsz * num_heads
        assert static_k.size(2) == head_dim
        k = static_k

    if static_v is not None:
        assert static_v.size(0) == bsz * num_heads
        assert static_v.size(2) == head_dim
        v = static_v

    src_len = k.size(1)

    if key_padding_mask is not None:
        assert key_padding_mask.size(0) == bsz
        assert key_padding_mask.size(1) == src_len

    if add_zero_attn:
        src_len += 1
        k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1)
        v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1)
        if attn_mask is not None:
            attn_mask = pad(attn_mask, (0, 1))
        if key_padding_mask is not None:
            key_padding_mask = pad(key_padding_mask, (0, 1))

    attn_output_weights = torch.bmm(q, k.transpose(1, 2))
    assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]

    if attn_mask is not None:
        if attn_mask.dtype == torch.bool:
            attn_output_weights.masked_fill_(attn_mask, float('-inf'))
        else:
            attn_output_weights += attn_mask

    if key_padding_mask is not None:
        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
        attn_output_weights = attn_output_weights.masked_fill(
            key_padding_mask.unsqueeze(1).unsqueeze(2),
            float('-inf'),
        )
        attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len)

    attn_output_weights = softmax(
        attn_output_weights, dim=-1)
    attn_output_weights = dropout(attn_output_weights, p=dropout_p, training=training)

    attn_output = torch.bmm(attn_output_weights, v)
    assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim]
    attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
    attn_output = linear(attn_output, out_proj_weight, out_proj_bias)

    if need_weights:
        # average attention weights over heads
        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
        return attn_output, attn_output_weights.sum(dim=1) / num_heads
    else:
        return attn_output, None
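This module mirrors torch.nn.MultiheadAttention, and its docstring example applies directly; the functional core also returns the head-averaged attention map that the grounding model later exposes as attn_output_weights. A minimal shape check, assuming embed_dim 256 and 8 heads (values chosen only for illustration):

# Hedged sketch: smoke test for the MultiheadAttention defined above.
import torch

attn = MultiheadAttention(embed_dim=256, num_heads=8)
q = torch.randn(10, 2, 256)   # (target_len, batch, embed_dim)
kv = torch.randn(20, 2, 256)  # (source_len, batch, embed_dim)
out, weights = attn(q, kv, kv)
print(out.shape)      # torch.Size([10, 2, 256])
print(weights.shape)  # torch.Size([2, 10, 20]) -- averaged over the 8 heads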
med_rpg/models/__init__.py
ADDED
@@ -0,0 +1,6 @@
from .trans_vg_ca import TransVG_ca


def build_model(args):
    if args.model_name == 'TransVG_ca':
        return TransVG_ca(args)
med_rpg/models/__pycache__/MHA.cpython-310.pyc
ADDED
Binary file (15.6 kB)
med_rpg/models/__pycache__/MHA.cpython-37.pyc
ADDED
Binary file (15.4 kB)
med_rpg/models/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (342 Bytes)
med_rpg/models/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (331 Bytes)
med_rpg/models/__pycache__/trans_vg_ca.cpython-310.pyc
ADDED
Binary file (3.04 kB)
med_rpg/models/__pycache__/trans_vg_ca.cpython-37.pyc
ADDED
Binary file (3.02 kB)
med_rpg/models/__pycache__/vl_transformer.cpython-310.pyc
ADDED
Binary file (5.51 kB)
med_rpg/models/__pycache__/vl_transformer.cpython-37.pyc
ADDED
Binary file (5.36 kB)
med_rpg/models/language_model/__init__.py
ADDED
File without changes
med_rpg/models/language_model/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (161 Bytes)
med_rpg/models/language_model/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (156 Bytes)
med_rpg/models/language_model/__pycache__/bert.cpython-310.pyc
ADDED
Binary file (1.74 kB)
med_rpg/models/language_model/__pycache__/bert.cpython-37.pyc
ADDED
Binary file (1.71 kB)
med_rpg/models/language_model/bert.py
ADDED
@@ -0,0 +1,63 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Language backbone modules (BERT).
"""
from collections import OrderedDict

import torch
import torch.nn.functional as F

from torch import nn
from typing import Dict, List

from utils.misc import NestedTensor, is_main_process
# from .position_encoding import build_position_encoding

# from pytorch_pretrained_bert.modeling import BertModel
# from transformers import BertModel
from transformers import AutoTokenizer, AutoModel


class BERT(nn.Module):
    def __init__(self, name: str, train_bert: bool, hidden_dim: int, max_len: int, enc_num):
        super().__init__()
        # if name == 'bert-base-uncased':
        #     self.num_channels = 768
        # else:
        #     self.num_channels = 1024
        self.num_channels = 768
        self.enc_num = enc_num

        self.bert = AutoModel.from_pretrained(name)

        if not train_bert:
            for parameter in self.bert.parameters():
                parameter.requires_grad_(False)

    def forward(self, tensor_list: NestedTensor):

        if self.enc_num > 0:
            # # pytorch_pretrained_bert version
            # all_encoder_layers, _ = self.bert(tensor_list.tensors, token_type_ids=None, attention_mask=tensor_list.mask)
            # # use the output of the X-th transformer encoder layer
            # xs = all_encoder_layers[self.enc_num - 1]

            # transformers bert version
            bert_output = self.bert(tensor_list.tensors, token_type_ids=None, attention_mask=tensor_list.mask)
            xs = bert_output.last_hidden_state
        else:
            xs = self.bert.embeddings.word_embeddings(tensor_list.tensors)

        # invert the attention mask: True marks padded positions, as expected downstream
        mask = tensor_list.mask.to(torch.bool)
        mask = ~mask
        out = NestedTensor(xs, mask)

        return out


def build_bert(args):
    # position_embedding = build_position_encoding(args)
    train_bert = args.lr_bert > 0
    bert = BERT(args.bert_model, train_bert, args.hidden_dim, args.max_query_len, args.bert_enc_num)
    # model = Joiner(bert, position_embedding)
    # model.num_channels = bert.num_channels
    return bert
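For orientation, the wrapper consumes a NestedTensor of padded token ids plus their attention mask and returns a NestedTensor of per-token features with an inverted (True = padding) mask. A minimal sketch, assuming `bert-base-uncased`, a 20-token query length, and the project's utils.misc.NestedTensor; the exact argument values here are illustrative, not the repo's configured defaults:

# Hedged sketch: running a phrase through the BERT wrapper above.
import torch
from transformers import AutoTokenizer
from utils.misc import NestedTensor

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
enc = tokenizer('Small left apical pneumothorax', padding='max_length',
                max_length=20, return_tensors='pt')
text_data = NestedTensor(enc['input_ids'], enc['attention_mask'])
bert = BERT('bert-base-uncased', train_bert=False, hidden_dim=256, max_len=20, enc_num=12)
feats = bert(text_data)
print(feats.tensors.shape)  # torch.Size([1, 20, 768])
print(feats.mask.dtype)     # torch.bool, True marks padded positions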
med_rpg/models/trans_vg_ca.py
ADDED
@@ -0,0 +1,88 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# from pytorch_pretrained_bert.modeling import BertModel
from .visual_model.detr import build_detr
from .language_model.bert import build_bert
from .vl_transformer import build_vl_transformer
import copy
# from utils.box_utils import xywh2xyxy


class TransVG_ca(nn.Module):
    def __init__(self, args):
        super(TransVG_ca, self).__init__()
        hidden_dim = args.vl_hidden_dim
        divisor = 16 if args.dilation else 32
        self.num_visu_token = int((args.imsize / divisor) ** 2)
        self.num_text_token = args.max_query_len

        self.visumodel = build_detr(args)
        self.textmodel = build_bert(args)

        num_total = self.num_visu_token + self.num_text_token + 1
        self.vl_pos_embed = nn.Embedding(num_total, hidden_dim)
        self.reg_token = nn.Embedding(1, hidden_dim)

        self.visu_proj = nn.Linear(self.visumodel.num_channels, hidden_dim)
        self.text_proj = nn.Linear(self.textmodel.num_channels, hidden_dim)

        self.vl_transformer = build_vl_transformer(args)
        self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)

    def forward(self, img_data, text_data):
        bs = img_data.tensors.shape[0]

        # visual backbone
        visu_mask, visu_src = self.visumodel(img_data)
        visu_src = self.visu_proj(visu_src)  # (N*B)xC, shape: torch.Size([8, 400, 256])

        # language bert
        text_fea = self.textmodel(text_data)
        text_src, text_mask = text_fea.decompose()  # torch.Size([8, 20, 768]); torch.Size([8, 20])
        assert text_mask is not None
        text_src = self.text_proj(text_src)  # torch.Size([8, 20, 256])
        # permute BxLenxC to LenxBxC
        text_src = text_src.permute(1, 0, 2)  # torch.Size([20, 8, 256])
        text_mask = text_mask.flatten(1)  # torch.Size([8, 20])

        # target regression token
        tgt_src = self.reg_token.weight.unsqueeze(1).repeat(1, bs, 1)
        tgt_mask = torch.zeros((bs, 1)).to(tgt_src.device).to(torch.bool)

        vl_src = torch.cat([tgt_src, text_src, visu_src], dim=0)
        vl_mask = torch.cat([tgt_mask, text_mask, visu_mask], dim=1)
        vl_pos = self.vl_pos_embed.weight.unsqueeze(1).repeat(1, bs, 1)

        vg_hs, attn_output_weights = self.vl_transformer(vl_src, vl_mask, vl_pos)  # (1+L+N)xBxC
        ##
        # with torch.no_grad():
        #     vg_hs_fool, _ = self.vl_transformer(vl_src, vl_mask, vl_pos)
        #     vg_reg_fool = vg_hs_fool[0]
        #     pred_box_fool = self.bbox_embed(vg_reg_fool).sigmoid()
        ##
        vg_reg = vg_hs[0]
        vg_text = vg_hs[1:21]
        vg_visu = vg_hs[21:]

        pred_box = self.bbox_embed(vg_reg).sigmoid()
        return {'pred_box': pred_box, 'vg_visu': vg_visu, 'vg_text': vg_text, 'text_mask': text_mask,
                'attn_output_weights': attn_output_weights, 'vg_reg': vg_reg, 'vg_hs': vg_hs, 'text_data': text_data}


class MLP(nn.Module):
    """ Very simple multi-layer perceptron (also called FFN)"""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x
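The slicing vg_hs[1:21] / vg_hs[21:] assumes max_query_len = 20 text tokens sitting behind the single regression token, and the number of visual tokens follows from the input size. A small worked check of that token layout, using the demo-style settings suggested by the shape comments above (imsize 640, no dilation); these values are assumptions, not read from a config file:

# Hedged check of the token layout consumed by TransVG_ca's forward pass.
imsize, dilation, max_query_len = 640, False, 20   # assumed demo defaults
divisor = 16 if dilation else 32
num_visu_token = int((imsize / divisor) ** 2)      # 20 * 20 = 400 visual tokens
num_total = 1 + max_query_len + num_visu_token     # [REG] + text + visual = 421
print(num_visu_token, num_total)                   # 400 421
# vg_hs[0]     -> [REG] token used for box regression
# vg_hs[1:21]  -> the 20 text tokens
# vg_hs[21:]   -> the 400 visual tokens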
med_rpg/models/transformer.py
ADDED
@@ -0,0 +1,314 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
DETR Transformer class.
Copy-paste from torch.nn.Transformer with modifications:
    * positional encodings are passed in MHattention
    * extra LN at the end of encoder is removed
    * decoder returns a stack of activations from all decoding layers
"""
import copy
from typing import Optional, List

import torch
import torch.nn.functional as F
from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_
from torch import nn, Tensor


class Transformer(nn.Module):
    """
    Modified based on deformable transformer to enable multi-scale.
    """
    def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False, num_feature_levels=1,
                 return_intermediate_dec=False):
        super().__init__()

        encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward,
                                                dropout, activation, normalize_before)
        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)

        decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward,
                                                dropout, activation, normalize_before)
        decoder_norm = nn.LayerNorm(d_model)
        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,
                                          return_intermediate=return_intermediate_dec)

        self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))
        self._reset_parameters()

        self.d_model = d_model
        self.nhead = nhead

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        normal_(self.level_embed)

    def forward(self, srcs, masks, pos_embeds, query_embed=None, lang_feat=None):
        # srcs / masks / pos_embeds are per-feature-level lists of (B, C, H, W) maps,
        # their padding masks, and the matching positional encodings
        src_flatten = []
        mask_flatten = []
        lvl_pos_embed_flatten = []
        for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):
            bs, c, h, w = src.shape
            src = src.flatten(2).transpose(1, 2)
            mask = mask.flatten(1)
            pos_embed = pos_embed.flatten(2).transpose(1, 2)
            lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)
            lvl_pos_embed_flatten.append(lvl_pos_embed)
            src_flatten.append(src)
            mask_flatten.append(mask)
        src_flatten = torch.cat(src_flatten, 1).transpose(0, 1)
        mask_flatten = torch.cat(mask_flatten, 1).transpose(0, 1)
        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1).transpose(0, 1)

        query_embed, tgt = torch.split(query_embed, c, dim=1)
        query_embed = query_embed.unsqueeze(1).expand(-1, bs, -1)
        tgt = tgt.unsqueeze(1).expand(-1, bs, -1)
        lang_feat = lang_feat.transpose(0, 1)

        query_embed = query_embed + lang_feat

        memory = self.encoder(src_flatten, src_key_padding_mask=mask_flatten, pos=lvl_pos_embed_flatten)
        hs = self.decoder(tgt, memory, memory_key_padding_mask=mask_flatten,
                          pos=lvl_pos_embed_flatten, query_pos=query_embed)
        return hs.transpose(1, 2),  # memory.permute(1, 2, 0).view(bs, c, h, w)


class TransformerEncoder(nn.Module):

    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src,
                mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None):
        output = src

        for layer in self.layers:
            output = layer(output, src_mask=mask,
                           src_key_padding_mask=src_key_padding_mask, pos=pos)

        if self.norm is not None:
            output = self.norm(output)

        return output


class TransformerDecoder(nn.Module):

    def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def forward(self, tgt, memory,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        output = tgt

        intermediate = []

        for layer in self.layers:
            output = layer(output, memory, tgt_mask=tgt_mask,
                           memory_mask=memory_mask,
                           tgt_key_padding_mask=tgt_key_padding_mask,
                           memory_key_padding_mask=memory_key_padding_mask,
                           pos=pos, query_pos=query_pos)
            if self.return_intermediate:
                intermediate.append(self.norm(output))

        if self.norm is not None:
            output = self.norm(output)
            if self.return_intermediate:
                intermediate.pop()
                intermediate.append(output)

        if self.return_intermediate:
            return torch.stack(intermediate)

        return output.unsqueeze(0)


class TransformerEncoderLayer(nn.Module):

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward_post(self,
                     src,
                     src_mask: Optional[Tensor] = None,
                     src_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None):
        q = k = self.with_pos_embed(src, pos)
        src2 = self.self_attn(q, k, value=src, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

    def forward_pre(self, src,
                    src_mask: Optional[Tensor] = None,
                    src_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None):
        src2 = self.norm1(src)
        q = k = self.with_pos_embed(src2, pos)
        src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src2 = self.norm2(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
        src = src + self.dropout2(src2)
        return src

    def forward(self, src,
                src_mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None):
        if self.normalize_before:
            return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
        return self.forward_post(src, src_mask, src_key_padding_mask, pos)


class TransformerDecoderLayer(nn.Module):

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward_post(self, tgt, memory,
                     tgt_mask: Optional[Tensor] = None,
                     memory_mask: Optional[Tensor] = None,
                     tgt_key_padding_mask: Optional[Tensor] = None,
                     memory_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None,
                     query_pos: Optional[Tensor] = None):
        q = k = self.with_pos_embed(tgt, query_pos)
        tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask,
                              key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)
        tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos),
                                   key=self.with_pos_embed(memory, pos),
                                   value=memory, attn_mask=memory_mask,
                                   key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)
        return tgt

    def forward_pre(self, tgt, memory,
                    tgt_mask: Optional[Tensor] = None,
                    memory_mask: Optional[Tensor] = None,
                    tgt_key_padding_mask: Optional[Tensor] = None,
                    memory_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None,
                    query_pos: Optional[Tensor] = None):
        tgt2 = self.norm1(tgt)
        q = k = self.with_pos_embed(tgt2, query_pos)
        tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask,
                              key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt2 = self.norm2(tgt)
        tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos),
                                   key=self.with_pos_embed(memory, pos),
                                   value=memory, attn_mask=memory_mask,
                                   key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt2 = self.norm3(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
        tgt = tgt + self.dropout3(tgt2)
        return tgt

    def forward(self, tgt, memory,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        if self.normalize_before:
            return self.forward_pre(tgt, memory, tgt_mask, memory_mask,
                                    tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
        return self.forward_post(tgt, memory, tgt_mask, memory_mask,
                                 tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)


def _get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])


def build_transformer(args):
    return Transformer(
        d_model=args.hidden_dim,
        nhead=args.nheads,
        num_encoder_layers=args.enc_layers,
        num_decoder_layers=args.dec_layers,
        dim_feedforward=args.dim_feedforward,
        dropout=args.dropout,
        activation="relu",
        return_intermediate_dec=True,
        num_feature_levels=args.num_feature_levels)


def _get_activation_fn(activation):
    """Return an activation function given a string"""
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return F.gelu
    if activation == "glu":
        return F.glu
    raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
med_rpg/models/visual_model/__init__.py
ADDED
File without changes
med_rpg/models/visual_model/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (159 Bytes)
med_rpg/models/visual_model/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (154 Bytes)
med_rpg/models/visual_model/__pycache__/backbone.cpython-310.pyc
ADDED
Binary file (4.59 kB)
med_rpg/models/visual_model/__pycache__/backbone.cpython-37.pyc
ADDED
Binary file (4.56 kB)
med_rpg/models/visual_model/__pycache__/detr.cpython-310.pyc
ADDED
Binary file (3.75 kB)
med_rpg/models/visual_model/__pycache__/detr.cpython-37.pyc
ADDED
Binary file (3.75 kB)
med_rpg/models/visual_model/__pycache__/position_encoding.cpython-310.pyc
ADDED
Binary file (3.54 kB)
med_rpg/models/visual_model/__pycache__/position_encoding.cpython-37.pyc
ADDED
Binary file (3.52 kB)
med_rpg/models/visual_model/__pycache__/transformer.cpython-310.pyc
ADDED
Binary file (10.2 kB)
med_rpg/models/visual_model/__pycache__/transformer.cpython-37.pyc
ADDED
Binary file (10.1 kB)
med_rpg/models/visual_model/backbone.py
ADDED
@@ -0,0 +1,121 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Backbone modules.
"""
from collections import OrderedDict

import torch
import torch.nn.functional as F
import torchvision
from torch import nn
from torchvision.models._utils import IntermediateLayerGetter
from typing import Dict, List

from utils.misc import NestedTensor, is_main_process

from .position_encoding import build_position_encoding


class FrozenBatchNorm2d(torch.nn.Module):
    """
    BatchNorm2d where the batch statistics and the affine parameters are fixed.

    Copy-paste from torchvision.misc.ops with added eps before rsqrt,
    without which any models other than torchvision.models.resnet[18,34,50,101]
    produce nans.
    """

    def __init__(self, n):
        super(FrozenBatchNorm2d, self).__init__()
        self.register_buffer("weight", torch.ones(n))
        self.register_buffer("bias", torch.zeros(n))
        self.register_buffer("running_mean", torch.zeros(n))
        self.register_buffer("running_var", torch.ones(n))

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        num_batches_tracked_key = prefix + 'num_batches_tracked'
        if num_batches_tracked_key in state_dict:
            del state_dict[num_batches_tracked_key]

        super(FrozenBatchNorm2d, self)._load_from_state_dict(
            state_dict, prefix, local_metadata, strict,
            missing_keys, unexpected_keys, error_msgs)

    def forward(self, x):
        # move reshapes to the beginning
        # to make it fuser-friendly
        w = self.weight.reshape(1, -1, 1, 1)
        b = self.bias.reshape(1, -1, 1, 1)
        rv = self.running_var.reshape(1, -1, 1, 1)
        rm = self.running_mean.reshape(1, -1, 1, 1)
        eps = 1e-5
        scale = w * (rv + eps).rsqrt()
        bias = b - rm * scale
        return x * scale + bias


class BackboneBase(nn.Module):

    def __init__(self, name: str, backbone: nn.Module, num_channels: int, return_interm_layers: bool):
        super().__init__()
        for name, parameter in backbone.named_parameters():
            if 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
                parameter.requires_grad_(False)
        if return_interm_layers:
            return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"}
        else:
            return_layers = {'layer4': "0"}
        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
        self.num_channels = num_channels

    def forward(self, tensor_list: NestedTensor):
        xs = self.body(tensor_list.tensors)
        out: Dict[str, NestedTensor] = {}
        for name, x in xs.items():
            m = tensor_list.mask
            assert m is not None
            mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
            out[name] = NestedTensor(x, mask)
        return out


class Backbone(BackboneBase):
    """ResNet backbone with frozen BatchNorm."""
    def __init__(self, name: str,
                 return_interm_layers: bool,
                 dilation: bool):

        backbone = getattr(torchvision.models, name)(
            replace_stride_with_dilation=[False, False, dilation],
            pretrained=False, norm_layer=FrozenBatchNorm2d)
            # pretrained=is_main_process(), norm_layer=FrozenBatchNorm2d)
        assert name in ('resnet50', 'resnet101')
        num_channels = 2048
        super().__init__(name, backbone, num_channels, return_interm_layers)


class Joiner(nn.Sequential):
    def __init__(self, backbone, position_embedding):
        super().__init__(backbone, position_embedding)

    def forward(self, tensor_list: NestedTensor):
        xs = self[0](tensor_list)
        out: List[NestedTensor] = []
        pos = []
        for name, x in xs.items():
            out.append(x)
            # position encoding
            pos.append(self[1](x).to(x.tensors.dtype))

        return out, pos


def build_backbone(args):
    position_embedding = build_position_encoding(args)
    # train_backbone = args.lr_detr > 0
    return_interm_layers = False
    backbone = Backbone(args.backbone, return_interm_layers, args.dilation)
    model = Joiner(backbone, position_embedding)
    model.num_channels = backbone.num_channels
    return model
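FrozenBatchNorm2d folds the stored statistics and affine parameters into a single scale and bias, so in eval mode it should match a regular BatchNorm2d that holds the same buffers (up to the eps handling). A quick, hedged sanity check along those lines; the channel count and input shape are arbitrary illustration values:

# Hedged sketch: FrozenBatchNorm2d vs. nn.BatchNorm2d in eval mode.
import torch
import torch.nn as nn

n = 8
bn = nn.BatchNorm2d(n, eps=1e-5).eval()
fbn = FrozenBatchNorm2d(n)
# copy the learned parameters and running statistics into the frozen variant
fbn.weight.copy_(bn.weight.detach())
fbn.bias.copy_(bn.bias.detach())
fbn.running_mean.copy_(bn.running_mean)
fbn.running_var.copy_(bn.running_var)

x = torch.randn(2, n, 16, 16)
print(torch.allclose(bn(x), fbn(x), atol=1e-5))  # expected: True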