franchesoni committed
Commit e1b51e5 • Parent(s): 2df2c09

v0

Files changed:
- .gitignore +5 -0
- app.py +197 -0
- busam.py +137 -0
- losses.py +211 -0
- network.py +267 -0
- utils.py +219 -0
.gitignore
ADDED
@@ -0,0 +1,5 @@
*.sh
*.pth
*.pkl
__pycache__/
flagged/
app.py
ADDED
@@ -0,0 +1,197 @@
from PIL import Image
import torch
import numpy as np
import gradio as gr
from pathlib import Path

from busam import Busam

resize_to = 512
checkpoint = "weights.pth"
device = "cpu"
print("Loading model...")
busam = Busam(checkpoint=checkpoint, device=device, side=resize_to)
minmaxnorm = lambda x: (x - x.min()) / (x.max() - x.min())


def edge_inference(img, algorithm, th_low=None, th_high=None):
    algorithm = algorithm.lower()
    print("Loading image...")
    img = np.array(img[:, :, :3])
    print("Getting features...")
    pred, size = busam.process_image(img, do_activate=True)
    print("Computing edges...")
    if algorithm == "sobel":
        edge = busam.sobel_from_pred(pred, size)
    elif algorithm == "canny":
        th_low, th_high = th_low or 5000, th_high or 10000
        edge = busam.canny_from_pred(pred, size, th_low=th_low, th_high=th_high)
    else:
        raise ValueError("algorithm should be sobel or canny")
    edge = edge.cpu().numpy() if isinstance(edge, torch.Tensor) else edge

    print("Done")
    return Image.fromarray(
        (minmaxnorm(edge) * 255).astype(np.uint8)
    ).resize(size[::-1])


def dimred_inference(
    img,
    algorithm,
    resample_pct,
):
    algorithm = algorithm.lower()
    img = np.array(img[:, :, :3])
    print("Getting features...")
    pred, size = busam.process_image(img, do_activate=True)
    # pred is 1, F, S, S
    assert pred.shape[1] >= 3, "should have at least 3 channels"
    if algorithm == "pca":
        from sklearn.decomposition import PCA
        reducer = PCA(n_components=3)
    elif algorithm == "tsne":
        from sklearn.manifold import TSNE
        reducer = TSNE(n_components=3)
    elif algorithm == "umap":
        from umap import UMAP
        reducer = UMAP(n_components=3)
    else:
        raise ValueError("algorithm should be pca, tsne or umap")
    np_y_hat = pred.detach().cpu().permute(1, 0, 2, 3).numpy()  # F, B, H, W
    np_y_hat = np_y_hat.reshape(np_y_hat.shape[0], -1)  # F, BHW
    np_y_hat = np_y_hat.T  # BHW, F
    resample_pct = 10**resample_pct
    resample_size = int(resample_pct * np_y_hat.shape[0])
    sampled_pixels = np_y_hat[:: np_y_hat.shape[0] // resample_size]
    print("dim reduction fit..." + " " * 30, end="\r")
    reducer = reducer.fit(sampled_pixels)
    print("dim reduction transform..." + " " * 30, end="\r")
    reducer.transform(np_y_hat[:10])  # warm-up call so numba compiles the function
    np_y_hat = reducer.transform(np_y_hat)  # BHW, 3
    print()
    print("Done. Saving...")
    # revert back to original shape
    colors = np_y_hat.reshape(pred.shape[2], pred.shape[3], 3)
    return Image.fromarray((minmaxnorm(colors) * 255).astype(np.uint8)).resize(
        size[::-1]
    )


def segmentation_inference(img, algorithm, scale):
    algorithm = algorithm.lower()
    img = np.array(img[:, :, :3])
    print("Getting features...")
    pred, size = busam.process_image(img, do_activate=True)
    print("Computing segmentation...")
    if algorithm == "kmeans":
        from sklearn.cluster import KMeans

        n_clusters = int(100 / 100**scale)
        kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(
            pred.view(pred.shape[1], -1).T
        )
        labels = kmeans.labels_
        labels = labels.reshape(pred.shape[2], pred.shape[3])
    elif algorithm == "felzenszwalb":
        from skimage.segmentation import felzenszwalb

        labels = felzenszwalb(
            (minmaxnorm(pred[0].cpu().numpy()) * 255).astype(np.uint8).transpose(1, 2, 0),
            scale=10 ** (8 * scale - 3),
            sigma=0,
            min_size=50,
        )
    elif algorithm == "slic":
        from skimage.segmentation import slic

        labels = slic(
            (minmaxnorm(pred[0].cpu().numpy()) * 255).astype(np.uint8).transpose(1, 2, 0),
            n_segments=int(100 / 100**scale),
            compactness=0.00001,
            sigma=1,
        )
    else:
        raise ValueError("algorithm should be kmeans, felzenszwalb or slic")
    print("Done")
    # label values are usually close to each other both spatially and in magnitude,
    # which complicates visualization; remap them so neighboring labels look distinct
    out = labels.copy()
    out[labels % 4 == 0] = labels[labels % 4 == 0] * 1 // 4
    out[labels % 4 == 1] = labels[labels % 4 == 1] * 4 // 4 + 1
    out[labels % 4 == 2] = labels[labels % 4 == 2] * 2 // 4 + 2
    out[labels % 4 == 3] = labels[labels % 4 == 3] * 3 // 4 + 3
    return Image.fromarray(
        (minmaxnorm(out) * 255).astype(np.uint8)
    ).resize(size[::-1])


def one_click_segmentation(img, row, col, threshold):  # threshold is currently unused
    row, col = int(row), int(col)
    img = np.array(img[:, :, :3])
    # draw a small cross at the click position for visualization
    click_map = np.zeros(img.shape[:2], dtype=bool)
    click_map[max(0, row - 5):min(img.shape[0], row + 5), col] = True
    click_map[row, max(0, col - 5):min(img.shape[1], col + 5)] = True
    print("Getting features...")
    pred, size = busam.process_image(img, do_activate=True)
    print("Getting mask...")
    mask = busam.get_mask((pred, size), (row, col))
    print("Done")
    print("shapes=", img.shape, mask.shape, click_map.shape)
    return (img, [(mask, "Prediction"), (click_map, "Click")])


with gr.Blocks() as demo:
    with gr.Tab("Edge detection"):
        algorithm = "canny"  # placeholder; replaced by the Radio component below
        with gr.Row():
            def enable_sliders(algorithm):
                algorithm = algorithm.lower()
                return gr.Slider(visible=algorithm == "canny"), gr.Slider(visible=algorithm == "canny")

            with gr.Column():
                image_input = gr.Image(label="Input Image")
                run_button = gr.Button("Run")
                algorithm = gr.Radio(["Sobel", "Canny"], label="Algorithm", value="Sobel")
                # sliders for th_low, th_high (shown only when Canny is selected)
                th_low_slider = gr.Slider(0, 32768, 10000, label="Canny's low threshold", visible=False)
                th_high_slider = gr.Slider(0, 32768, 20000, label="Canny's high threshold", visible=False)
                algorithm.change(enable_sliders, inputs=[algorithm], outputs=[th_low_slider, th_high_slider])
            with gr.Column():
                output_image = gr.Image(label="Output Image")
        run_button.click(edge_inference, inputs=[image_input, algorithm, th_low_slider, th_high_slider], outputs=output_image)
        gr.Examples([str(p) for p in Path("demoimgs").glob("*")], inputs=image_input)

    with gr.Tab("Reduction to 3D"):
        with gr.Row():
            with gr.Column():
                image_input = gr.Image(label="Input Image")
                algorithm = gr.Radio(["PCA", "TSNE", "UMAP"], label="Algorithm")
                run_button = gr.Button("Run")
                gr.Markdown("⚠️ UMAP is slow, TSNE is ultra-slow, use resample x<-3 ⚠️")
                resample_pct = gr.Slider(-5, 0, -3, label="Resample (10^x)*100%")
            with gr.Column():
                output_image = gr.Image(label="Output Image")
        run_button.click(dimred_inference, inputs=[image_input, algorithm, resample_pct], outputs=output_image)
        gr.Examples([str(p) for p in Path("demoimgs").glob("*")], inputs=image_input)

    with gr.Tab("Classical Segmentation"):
        with gr.Row():
            with gr.Column():
                image_input = gr.Image(label="Input Image")
                algorithm = gr.Radio(["KMeans", "Felzenszwalb", "SLIC"], label="Algorithm", value="SLIC")
                scale = gr.Slider(0.1, 1.0, 0.5, label="Scale")
                run_button = gr.Button("Run")
            with gr.Column():
                output_image = gr.Image(label="Output Image")
        run_button.click(segmentation_inference, inputs=[image_input, algorithm, scale], outputs=output_image)
        gr.Examples([str(p) for p in Path("demoimgs").glob("*")], inputs=image_input)

    with gr.Tab("One-click segmentation"):
        with gr.Row():
            with gr.Column():
                image_input = gr.Image(label="Input Image")
                threshold = gr.Slider(0, 1, 0.5, label="Threshold")
                with gr.Row():
                    row = gr.Textbox(10, label="Click's row")
                    col = gr.Textbox(10, label="Click's column")
                run_button = gr.Button("Run")
            with gr.Column():
                output_image = gr.AnnotatedImage(label="Output")
        run_button.click(one_click_segmentation, inputs=[image_input, row, col, threshold], outputs=output_image)
        gr.Examples([str(p) for p in Path("demoimgs").glob("*")], inputs=image_input)


demo.launch(share=False)
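
Usage note: app.py expects weights.pth and a demoimgs/ folder at runtime; neither is part of this commit. Below is a minimal, self-contained sketch of the same Blocks/Tab/Button.click wiring with a stub handler in place of the model, so it runs without the checkpoint (illustrative only, not part of the commit):

import gradio as gr

def invert(img):
    # stand-in for a model-backed handler; img arrives as an H, W, C uint8 array
    return 255 - img[:, :, :3]

with gr.Blocks() as mini_demo:
    with gr.Tab("Invert"):
        with gr.Row():
            with gr.Column():
                image_input = gr.Image(label="Input Image")
                run_button = gr.Button("Run")
            with gr.Column():
                output_image = gr.Image(label="Output Image")
        run_button.click(invert, inputs=[image_input], outputs=output_image)

mini_demo.launch(share=False)
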
busam.py
ADDED
@@ -0,0 +1,137 @@
import torch
from torch import nn
import numpy as np
from cv2 import resize
import cv2
from pathlib import Path

from network import EfficientViT_l1_r224
from losses import IISLoss, activate
from utils import minmaxnorm, load_from_ckpt


class Busam:
    def __init__(self, checkpoint, device, side=224):
        out_channels = 16
        use_norm_params = False
        net = EfficientViT_l1_r224(
            out_channels=out_channels, use_norm_params=use_norm_params, pretrained=False
        )
        net = load_from_ckpt(net, checkpoint)
        net = net.to(device)
        net.eval()
        self.net = net
        self.device = device
        self.side = side

    def prepare_img(self, img):
        """
        assume H, W, 3 image
        """
        assert len(img.shape) == 3, "should be H, W, 3 but is " + str(img.shape)
        assert img.shape[2] == 3, "should be H, W, 3 but is " + str(img.shape)
        assert img.min() >= 0, "min should be more than 0 but is " + str(img.min())
        assert img.max() <= 255, "max should be less than 255 but is " + str(img.max())
        assert img.dtype == np.uint8, "dtype should be np.uint8 but is " + str(
            img.dtype
        )
        nimg = resize(img, (self.side, self.side))
        tensorimg = (
            (torch.from_numpy(nimg / 255).permute(2, 0, 1) - 0.5)
            .float()[None]
            .to(self.device)
        )
        return tensorimg

    def process_image(self, img, do_activate=False):
        with torch.no_grad():
            x = self.prepare_img(img)
            pred = self.net(x)
            H, W = img.shape[:2]
            if do_activate:
                B, F, pH, pW = pred.shape
                features, _, _, _ = activate(
                    pred.view(F, pH * pW), None, "symlog", False, False, False
                )
                pred = features.view(B, F, pH, pW)
        return pred, (H, W)

    def get_mask(self, aux, click):
        """assume click is (row, col)"""
        pred = aux[0][0]  # remove batch dim
        oH, oW = aux[1]
        F, H, W = pred.shape
        features = pred.view(F, H * W)
        rclick = click[0] * H // oH, click[1] * W // oW  # click in feature-map coords
        sindex = rclick[0] * W + rclick[1]
        mask = IISLoss.get_mask_from_query(features, sindex)
        mask = mask.reshape(H, W)
        mask = (
            resize((mask.cpu().numpy() * 255).astype(np.uint8), (oW, oH)) > 100
        ).astype(bool)
        return mask

    def get_gradients(self, pred, size):
        F, H, W = pred[0].shape
        sobel_x = (
            torch.tensor([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]]).float().to(pred.device)
        )
        sobel_y = sobel_x.T
        sobel_x = sobel_x.repeat(F, 1, 1, 1)
        sobel_y = sobel_y.repeat(F, 1, 1, 1)
        edge_x = torch.nn.functional.conv2d(pred, sobel_x, padding=1, groups=F).view(
            F, H, W
        )  # conv output is 1, F, H, W
        edge_y = torch.nn.functional.conv2d(pred, sobel_y, padding=1, groups=F).view(
            F, H, W
        )
        edge_x = torch.norm(edge_x, dim=0, p=2)  # L2 norm over channels (takes sqrt)
        edge_y = torch.norm(edge_y, dim=0, p=2)  # H, W
        return edge_x, edge_y

    def sobel_from_pred(self, pred, size):
        edge_x, edge_y = self.get_gradients(pred, size)
        edge = torch.sqrt(edge_x**2 + edge_y**2)
        return edge

    def canny_from_pred(self, pred, size, th_low=10000, th_high=20000):
        th_low = th_low or th_high
        th_high = th_high or th_low

        edge_x, edge_y = self.get_gradients(pred, size)
        amin = min(edge_x.min(), edge_y.min())
        amax = max(edge_x.max(), edge_y.max())
        edge_x, edge_y = (edge_x - amin) / (amax - amin), (edge_y - amin) / (
            amax - amin
        )
        canny = cv2.Canny(cast_to_int16(edge_x), cast_to_int16(edge_y), th_low, th_high)
        return canny


def cast_to_int16(x):
    if isinstance(x, torch.Tensor):
        x = x.cpu().numpy()
    return (x * 32767).astype(np.int16)


# from segment_anything import sam_model_registry, SamPredictor
# class SAM:
#     sam_checkpoint = "sam_vit_b_01ec64.pth"
#     model_type = "vit_b"

#     def __init__(self, device):
#         sam = sam_model_registry[self.model_type](checkpoint=self.sam_checkpoint)
#         sam.to(device=device)
#         self.predictor = SamPredictor(sam)

#     def process_image(self, img):
#         self.predictor.set_image(img)
#         return None

#     def get_mask(self, aux, click):
#         input_point = np.array([[click[1], click[0]]])
#         input_label = np.array([1])
#         masks, scores, logits = self.predictor.predict(
#             point_coords=input_point, point_labels=input_label, multimask_output=False
#         )
#         return masks[0]
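
Usage note: a minimal sketch of driving the Busam wrapper directly, without the Gradio UI. It assumes weights.pth exists and the repo's dependencies (torch, timm, opencv-python, scikit-learn) are installed; the image path is a placeholder:

import numpy as np
from PIL import Image
from busam import Busam

busam = Busam(checkpoint="weights.pth", device="cpu", side=512)
img = np.array(Image.open("demoimgs/example.jpg").convert("RGB"))  # placeholder path
pred, size = busam.process_image(img, do_activate=True)  # pred: 1, 16, side, side; size: original H, W
edge = busam.sobel_from_pred(pred, size)                 # per-pixel edge strength, side x side
mask = busam.get_mask((pred, size), (size[0] // 2, size[1] // 2))  # bool mask at original resolution
print(pred.shape, edge.shape, mask.shape)
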
losses.py
ADDED
@@ -0,0 +1,211 @@
print("Importing standard...")
from abc import ABC, abstractmethod

print("Importing external...")
import torch
from torch.nn.functional import binary_cross_entropy

# from matplotlib import pyplot as plt

print("Importing internal...")
from utils import preprocess_masks_features, get_row_col, symlog, calculate_iou


######### BINARY LOSSES ###############
def my_lovasz_hinge(logits, gt, downsample=False):
    if downsample:
        offset = int(torch.randint(downsample - 1, (1,)))
        logits, gt = logits[:, offset::downsample], gt[:, offset::downsample]
    # B, HW
    gt = 1.0 * gt  # go float
    areas = gt.sum(dim=1, keepdim=True)  # B, 1
    # per_image = True, ignore = None
    signs = 2 * gt - 1
    errors = 1 - logits * signs
    errors_sorted, perm = torch.sort(errors, dim=1, descending=True)
    gt_sorted = torch.gather(gt, 1, perm)  # B, HW
    # lovasz grad
    intersection = areas - gt_sorted.cumsum(dim=1)  # B, HW
    union = areas + (1 - gt_sorted).cumsum(dim=1)  # B, HW
    jaccard = 1 - intersection / union  # B, HW
    jaccard[:, 1:] = jaccard[:, 1:] - jaccard[:, :-1]
    loss = (torch.relu(errors_sorted) * jaccard).sum(dim=1)  # B,
    return torch.nanmean(loss)


def focal_loss(scores, targets, alpha=0.25, gamma=2):
    p = scores
    ce_loss = binary_cross_entropy(p, targets, reduction="none")
    p_t = p * targets + (1 - p) * (1 - targets)
    loss = ce_loss * ((1 - p_t) ** gamma)

    if alpha >= 0:
        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
        loss = alpha_t * loss

    return loss


# also binary_cross_entropy and lovasz


########## SUBFUNCTIONS ##############
def get_distances(features, refs, sigma, norm_p, square_distances, H, W):
    # features: B, 1, F, HW
    # refs: B, M, F, 1
    # sigma: B, M, 1, 1
    B, M = refs.shape[0], refs.shape[1]
    distances = torch.norm(
        features - refs, dim=2, p=norm_p, keepdim=True
    )  # B, M, 1, H*W
    distances = distances**2 if square_distances else distances
    distances = (distances / (2 * sigma**2)).reshape(B, M, H * W)
    return distances


def activate(features, masks, activation, use_sigma, offset_pos, ret_prediction):
    # sigmoid is very similar to exp
    # prepare features
    assert activation in ["sigmoid", "symlog"]
    if masks is None:  # when inferencing
        B, M = 1, 1
        F, N = sorted(features.shape)  # assumes fewer channels than pixels
        H, W = [int(N ** (0.5))] * 2
        features = features.reshape(1, 1, -1, H * W)
    else:
        masks, features, M, B, H, W, F = preprocess_masks_features(masks, features)
    # features: B, 1, F, H*W
    # masks: B, M, 1, H*W
    if use_sigma:
        sigma = torch.nn.functional.softplus(features)[:, :, -1:]  # B, 1, 1, H*W
        features = features[:, :, :-1]
        F = features.shape[2]
    else:
        sigma = 1
    features = symlog(features) if activation == "symlog" else torch.sigmoid(features)
    if offset_pos:
        assert F >= 2
        row, col = get_row_col(H, W, features.device)
        row = row.reshape(1, 1, 1, H, 1).expand(B, 1, 1, H, W).reshape(B, 1, 1, H * W)
        col = col.reshape(1, 1, 1, 1, W).expand(B, 1, 1, H, W).reshape(B, 1, 1, H * W)
        positional_features = torch.cat([row, col], dim=2)  # B, 1, 2, H*W
        features[:, :, :2] = features[:, :, :2] + positional_features
    prediction = features.reshape(B, 1, -1, H, W) if ret_prediction else None
    if masks is None:
        features = features.reshape(-1, H * W)
        sigma = sigma.reshape(-1, H * W) if use_sigma else 1
        return features, sigma, H, W
    return features, masks, sigma, prediction, B, M, F, H, W


class AbstractLoss(ABC):
    @staticmethod
    @abstractmethod
    def loss(features, masks, ret_prediction=False, **kwargs):
        pass

    @staticmethod
    @abstractmethod
    def get_mask_from_query(features, sindex, **kwargs):
        pass


class IISLoss(AbstractLoss):
    @staticmethod
    def loss(features, masks, ret_prediction=False, K=3, logger=None):
        features, masks, sigma, prediction, B, M, F, H, W = activate(
            features, masks, "symlog", False, False, ret_prediction
        )
        rindices = torch.randperm(H * W, device=masks.device)
        # the following should work if all masks have more than K pixels
        sindices = torch.stack(
            [
                torch.stack([rindices[masks[b, m, 0, rindices]][:K] for m in range(M)])
                for b in range(B)
            ]
        )  # B, M, K
        feats_at_sindices = torch.gather(
            features.permute(0, 3, 1, 2).expand(B, H * W, K, F),
            dim=1,
            index=sindices.reshape(B, M, K, 1).expand(B, M, K, F),
        )  # B, M, K, F
        feats_at_sindices = feats_at_sindices.reshape(B, M, K, F, 1)  # B, M, K, F, 1
        dists = get_distances(
            features, feats_at_sindices.reshape(B, M * K, F, 1), sigma, 2, True, H, W
        )
        score = torch.exp(-dists)  # B, M*K, H*W in [0, 1]
        targets = (
            masks.expand(B, M, K, H * W).reshape(B, M * K, H * W).float()
        )  # B, M*K, H*W
        floss = focal_loss(score, targets).mean()
        lloss = my_lovasz_hinge(
            score.view(B * M * K, H * W) * 2 - 1,
            targets.view(B * M * K, H * W),
        )
        loss = floss + lloss
        return loss, prediction

    @staticmethod
    def get_mask_from_query(features, sindex):
        features, _, H, W = activate(features, None, "symlog", False, False, False)
        F = features.shape[0]
        query_feat = features[:, sindex]
        dists = get_distances(
            features.reshape(1, 1, F, H * W),
            query_feat.reshape(1, 1, F, 1),
            1,
            2,
            True,
            H,
            W,
        )
        score = torch.exp(-dists)  # 1, 1, H*W
        pred = score > 0.5
        return pred


def iis_iou(features, masks, get_mask_from_query, K=20):
    masks, features, M, B, H, W, F = preprocess_masks_features(masks, features)
    # features: B, 1, F, H*W
    # masks: B, M, 1, H*W
    rindices = torch.randperm(H * W).to(masks.device)
    sindices = torch.stack(
        [
            torch.stack([rindices[masks[b, m, 0, rindices]][:K] for m in range(M)])
            for b in range(B)
        ]
    )  # B, M, K
    cum_iou, n_samples = 0, 0
    for b in range(B):
        for m in range(M):
            for k in range(K):
                sindex = sindices[b, m, k]
                pred = get_mask_from_query(features[b, 0], sindex)
                iou = calculate_iou(pred, masks[b, m, 0, :])
                cum_iou += iou
                n_samples += 1

    return cum_iou / n_samples


losses_names = [
    "iis",
]


def get_loss_class(loss_name):
    if loss_name == "iis":
        return IISLoss
    else:
        raise NotImplementedError


def get_get_mask_from_query(loss_name):
    loss_class = get_loss_class(loss_name)
    return loss_class.get_mask_from_query


def get_loss(loss_name):
    loss_class = get_loss_class(loss_name)
    return loss_class.loss
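
Usage note: a self-contained sketch of calling IISLoss.loss on random features and two synthetic masks, following the shape conventions in the comments above (features B, F, H, W; masks B, M, H, W boolean, each mask with at least K pixels). It assumes scikit-learn is installed, since utils imports PCA at module load:

import torch
from losses import IISLoss

B, F, H, W = 1, 16, 32, 32
features = torch.randn(B, F, H, W)
masks = torch.zeros(B, 2, H, W, dtype=torch.bool)
masks[0, 0, : H // 2] = True  # top half
masks[0, 1, H // 2 :] = True  # bottom half
loss, _ = IISLoss.loss(features, masks)  # ret_prediction=False, so prediction is None
print(float(loss))
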
network.py
ADDED
@@ -0,0 +1,267 @@
print("Importing external...")
import torch
from torch import nn
import torch.nn.functional as F

from timm.models.efficientvit_mit import (
    ConvNormAct,
    FusedMBConv,
    MBConv,
    ResidualBlock,
    efficientvit_l1,
)
from timm.layers import GELUTanh


def val2list(x: list or tuple or any, repeat_time=1):
    if isinstance(x, (list, tuple)):
        return list(x)
    return [x for _ in range(repeat_time)]


def resize(
    x: torch.Tensor,
    size: any or None = None,
    scale_factor: list[float] or None = None,
    mode: str = "bicubic",
    align_corners: bool or None = False,
) -> torch.Tensor:
    if mode in {"bilinear", "bicubic"}:
        return F.interpolate(
            x,
            size=size,
            scale_factor=scale_factor,
            mode=mode,
            align_corners=align_corners,
        )
    elif mode in {"nearest", "area"}:
        return F.interpolate(x, size=size, scale_factor=scale_factor, mode=mode)
    else:
        raise NotImplementedError(f"resize(mode={mode}) not implemented.")


class UpSampleLayer(nn.Module):
    def __init__(
        self,
        mode="bicubic",
        size: int or tuple[int, int] or list[int] or None = None,
        factor=2,
        align_corners=False,
    ):
        super(UpSampleLayer, self).__init__()
        self.mode = mode
        self.size = val2list(size, 2) if size is not None else None
        self.factor = None if self.size is not None else factor
        self.align_corners = align_corners

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if (
            self.size is not None and tuple(x.shape[-2:]) == self.size
        ) or self.factor == 1:
            return x
        return resize(x, self.size, self.factor, self.mode, self.align_corners)


class DAGBlock(nn.Module):
    def __init__(
        self,
        inputs: dict[str, nn.Module],
        merge: str,
        post_input: nn.Module or None,
        middle: nn.Module,
        outputs: dict[str, nn.Module],
    ):
        super(DAGBlock, self).__init__()

        self.input_keys = list(inputs.keys())
        self.input_ops = nn.ModuleList(list(inputs.values()))
        self.merge = merge
        self.post_input = post_input

        self.middle = middle

        self.output_keys = list(outputs.keys())
        self.output_ops = nn.ModuleList(list(outputs.values()))

    def forward(self, feature_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
        feat = [
            op(feature_dict[key]) for key, op in zip(self.input_keys, self.input_ops)
        ]
        if self.merge == "add":
            feat = list_sum(feat)
        elif self.merge == "cat":
            feat = torch.concat(feat, dim=1)
        else:
            raise NotImplementedError
        if self.post_input is not None:
            feat = self.post_input(feat)
        feat = self.middle(feat)
        for key, op in zip(self.output_keys, self.output_ops):
            feature_dict[key] = op(feat)
        return feature_dict


def list_sum(x: list) -> any:
    return x[0] if len(x) == 1 else x[0] + list_sum(x[1:])


class SegHead(nn.Module):
    def __init__(
        self,
        fid_list: list[str],
        in_channel_list: list[int],
        stride_list: list[int],
        head_stride: int,
        head_width: int,
        head_depth: int,
        expand_ratio: float,
        middle_op: str,
        final_expand: float or None,
        n_classes: int,
        dropout=0,
        norm="bn2d",
        act_func="hswish",
    ):
        super(SegHead, self).__init__()
        # exceptions to adapt effvit to timm
        if act_func == "gelu":
            act_func = GELUTanh
        else:
            raise ValueError(f"act_func {act_func} not supported")
        if norm == "bn2d":
            norm_layer = nn.BatchNorm2d
        else:
            raise ValueError(f"norm {norm} not supported")

        inputs = {}
        for fid, in_channel, stride in zip(fid_list, in_channel_list, stride_list):
            factor = stride // head_stride
            if factor == 1:
                inputs[fid] = ConvNormAct(
                    in_channel, head_width, 1, norm_layer=norm_layer, act_layer=act_func
                )
            else:
                inputs[fid] = nn.Sequential(
                    ConvNormAct(
                        in_channel,
                        head_width,
                        1,
                        norm_layer=norm_layer,
                        act_layer=act_func,
                    ),
                    UpSampleLayer(factor=factor),
                )
        self.in_keys = inputs.keys()
        self.in_ops = nn.ModuleList(inputs.values())

        middle = []
        for _ in range(head_depth):
            if middle_op == "mbconv":
                block = MBConv(
                    head_width,
                    head_width,
                    expand_ratio=expand_ratio,
                    norm_layer=norm_layer,
                    act_layer=(act_func, act_func, None),
                )
            elif middle_op == "fmbconv":
                block = FusedMBConv(
                    head_width,
                    head_width,
                    expand_ratio=expand_ratio,
                    norm_layer=norm_layer,
                    act_layer=(act_func, None),
                )
            else:
                raise NotImplementedError
            middle.append(ResidualBlock(block, nn.Identity()))
        self.middle = nn.Sequential(*middle)

        # nn.Sequential cannot hold None, so build the layer list conditionally
        out_layers = []
        if final_expand is not None:
            out_layers.append(
                ConvNormAct(
                    head_width,
                    head_width * final_expand,
                    1,
                    norm_layer=norm_layer,
                    act_layer=act_func,
                )
            )
        out_layers.append(
            ConvNormAct(
                head_width * (final_expand or 1),
                n_classes,
                1,
                bias=True,
                dropout=dropout,
                norm_layer=None,
                act_layer=None,
            )
        )
        self.out_layer = nn.Sequential(*out_layers)

    def forward(self, feature_map_list):
        t_feat_maps = [
            self.in_ops[ind](feature_map_list[ind])
            for ind in range(len(feature_map_list))
        ]
        t_feat_map = list_sum(t_feat_maps)
        t_feat_map = self.middle(t_feat_map)
        out = self.out_layer(t_feat_map)
        return out


class EfficientViT_l1_r224(nn.Module):
    def __init__(
        self,
        out_channels,
        out_ds_factor=1,
        decoder_size="small",
        pretrained=False,
        use_norm_params=False,
    ):
        if decoder_size == "small":
            head_width = 32
            head_depth = 1
            middle_op = "mbconv"
        elif decoder_size == "medium":
            head_width = 64
            head_depth = 3
            middle_op = "mbconv"
        elif decoder_size == "large":
            head_width = 256
            head_depth = 3
            middle_op = "fmbconv"
        else:
            raise ValueError(f"decoder_size {decoder_size} not supported")

        super(EfficientViT_l1_r224, self).__init__()
        self.bbone = efficientvit_l1(
            num_classes=0, features_only=True, pretrained=pretrained
        )
        self.head = SegHead(
            fid_list=["stage4", "stage3", "stage2"],
            in_channel_list=[512, 256, 128],
            stride_list=[32, 16, 8],
            head_stride=out_ds_factor,
            head_width=head_width,
            head_depth=head_depth,
            expand_ratio=4,
            middle_op=middle_op,
            final_expand=8,
            n_classes=out_channels,
            act_func="gelu",
        )
        # [optional] deactivate normalization
        if not use_norm_params:
            for module in self.modules():
                if (
                    isinstance(module, nn.LayerNorm)
                    or isinstance(module, nn.BatchNorm2d)
                    or isinstance(module, nn.BatchNorm1d)
                ):
                    module.weight.requires_grad_(False)
                    module.bias.requires_grad_(False)

    def forward(self, x):
        feat = self.bbone(x)
        out = self.head([feat[3], feat[2], feat[1]])
        return out
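
Usage note: a quick forward-pass shape check for the network, assuming a recent timm that provides timm.models.efficientvit_mit and timm.layers.GELUTanh (pretrained=False avoids any weight download):

import torch
from network import EfficientViT_l1_r224

net = EfficientViT_l1_r224(out_channels=16, pretrained=False).eval()
x = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    y = net(x)
print(y.shape)  # expected 1, 16, 224, 224: with out_ds_factor=1 every stage is upsampled to input resolution
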
utils.py
ADDED
@@ -0,0 +1,219 @@
print("Importing standard...")
import subprocess
import shutil
from pathlib import Path

print("Importing external...")
import torch
import numpy as np
from PIL import Image

REDUCTION = "pca"
if REDUCTION == "umap":
    from umap import UMAP
elif REDUCTION == "tsne":
    from sklearn.manifold import TSNE
elif REDUCTION == "pca":
    from sklearn.decomposition import PCA


def symlog(x):
    return torch.sign(x) * torch.log(torch.abs(x) + 1)


def preprocess_masks_features(masks, features):
    # Get shapes right
    B, M, H, W = masks.shape
    Bf, F, Hf, Wf = features.shape
    masks = masks.reshape(B, M, 1, H * W)
    # # the following assertions should hold; removed for speed
    # assert H == Hf and W == Wf and B == Bf
    # assert masks.dtype == torch.bool
    # assert (mask_areas > 0).all(), "you shouldn't have empty masks"

    # Reduce M if there are empty masks (note: mask_areas is computed but not used)
    mask_areas = masks.sum(dim=3)  # B, M, 1
    features = features.reshape(B, 1, F, H * W)
    # output shapes
    # features: B, 1, F, H*W
    # masks: B, M, 1, H*W

    return masks, features, M, B, H, W, F


def get_row_col(H, W, device):
    # get position of pixels in [0, 1]
    row = torch.linspace(0, 1, H, device=device)
    col = torch.linspace(0, 1, W, device=device)
    return row, col


def get_current_git_commit():
    try:
        # Run the git command to get the current commit hash
        commit_hash = subprocess.check_output(["git", "rev-parse", "HEAD"]).strip()
        # Decode from bytes to a string
        return commit_hash.decode("utf-8")
    except subprocess.CalledProcessError:
        # Handle the case where the command fails (e.g., not a Git repository)
        print("An error occurred while trying to retrieve the git commit hash.")
        return None


def clean_dir(dirname):
    """Removes all directories in dirname that don't have a done.txt file"""
    dstdir = Path(dirname)
    dstdir.mkdir(exist_ok=True, parents=True)
    for f in dstdir.iterdir():
        # if the directory doesn't have a done.txt file, remove it
        if f.is_dir() and not (f / "done.txt").exists():
            shutil.rmtree(f)


def save_tensor_as_image(tensor, dstfile, global_step):
    dstfile = Path(dstfile)
    dstfile = (dstfile.parent / (dstfile.stem + "_" + str(global_step))).with_suffix(
        ".jpg"
    )
    save(tensor, str(dstfile))


def minmaxnorm(x):
    return (x - x.min()) / (x.max() - x.min())


def save(tensor, name, channel_offset=0):
    tensor = to_img(tensor, channel_offset=channel_offset)
    Image.fromarray(tensor).save(name)


def to_img(tensor, channel_offset=0):
    tensor = minmaxnorm(tensor)
    tensor = (tensor * 255).to(torch.uint8)
    C, H, W = tensor.shape
    if tensor.shape[0] == 1:
        tensor = tensor[0]
    elif tensor.shape[0] == 2:
        # show two channels as red and blue, leaving green empty
        tensor = torch.stack([tensor[0], torch.zeros_like(tensor[0]), tensor[1]], dim=0)
        tensor = tensor.permute(1, 2, 0)
    elif tensor.shape[0] >= 3:
        tensor = tensor[channel_offset : channel_offset + 3]
        tensor = tensor.permute(1, 2, 0)
    tensor = tensor.cpu().numpy()
    return tensor


def log_input_output(
    name,
    x,
    y_hat,
    global_step,
    img_dstdir,
    out_dstdir,
    reduce_dim=True,
    reduction=REDUCTION,
    resample_size=20000,
):
    y_hat = y_hat.reshape(
        y_hat.shape[0], y_hat.shape[2], y_hat.shape[3], y_hat.shape[4]
    )
    if reduce_dim and y_hat.shape[1] >= 3:
        reducer = (
            UMAP(n_components=3)
            if (reduction == "umap")
            else (
                TSNE(n_components=3)
                if reduction == "tsne"
                else PCA(n_components=3)
                if reduction == "pca"
                else None
            )
        )
        np_y_hat = y_hat.detach().cpu().permute(1, 0, 2, 3).numpy()  # F, B, H, W
        np_y_hat = np_y_hat.reshape(np_y_hat.shape[0], -1)  # F, BHW
        np_y_hat = np_y_hat.T  # BHW, F
        sampled_pixels = np_y_hat[:: np_y_hat.shape[0] // resample_size]  # assumes at least resample_size pixels
        print("dim reduction fit..." + " " * 30, end="\r")
        reducer = reducer.fit(sampled_pixels)
        print("dim reduction transform..." + " " * 30, end="\r")
        reducer.transform(np_y_hat[:10])  # warm-up call so numba compiles the function
        np_y_hat = reducer.transform(np_y_hat)  # BHW, 3
        # revert back to original shape
        y_hat2 = (
            torch.from_numpy(
                np_y_hat.T.reshape(3, y_hat.shape[0], y_hat.shape[2], y_hat.shape[3])
            )
            .to(y_hat.device)
            .permute(1, 0, 2, 3)
        )
        print("done" + " " * 30, end="\r")
    else:
        y_hat2 = y_hat

    for i in range(min(len(x), 8)):
        save_tensor_as_image(
            x[i],
            img_dstdir / f"input_{name}_{str(i).zfill(2)}",
            global_step=global_step,
        )
        for c in range(y_hat.shape[1]):
            save_tensor_as_image(
                y_hat[i, c : c + 1],
                out_dstdir / f"pred_channel_{name}_{str(i).zfill(2)}_{c}",
                global_step=global_step,
            )
        # log color image

        assert len(y_hat2.shape) == 4, "should be B, F, H, W"
        if reduce_dim:
            save_tensor_as_image(
                y_hat2[i][:3],
                out_dstdir / f"pred_reduced_{name}_{str(i).zfill(2)}",
                global_step=global_step,
            )
        save_tensor_as_image(
            y_hat[i][:3],
            out_dstdir / f"pred_colorchs_{name}_{str(i).zfill(2)}",
            global_step=global_step,
        )


def check_for_nan(loss, model, batch):
    try:
        assert not torch.isnan(loss)
    except Exception as e:
        # print things useful to debug
        # does the batch contain nan?
        print("img batch contains nan?", torch.isnan(batch[0]).any())
        print("mask batch contains nan?", torch.isnan(batch[1]).any())
        # do the model weights contain nan?
        for name, param in model.named_parameters():
            if torch.isnan(param).any():
                print(name, "contains nan")
        # does the output contain nan?
        print("output contains nan?", torch.isnan(model(batch[0])).any())
        # now raise the error
        raise e


def calculate_iou(pred, label):
    intersection = ((label == 1) & (pred == 1)).sum()
    union = ((label == 1) | (pred == 1)).sum()
    if not union:
        return 0
    else:
        iou = intersection.item() / union.item()
        return iou


def load_from_ckpt(net, ckpt_path, strict=True):
    """Load network weights"""
    if ckpt_path and Path(ckpt_path).exists():
        ckpt = torch.load(ckpt_path, map_location="cpu")
        if "MODEL_STATE" in ckpt:
            ckpt = ckpt["MODEL_STATE"]
        elif "state_dict" in ckpt:
            ckpt = ckpt["state_dict"]
        net.load_state_dict(ckpt, strict=strict)
        print("Loaded checkpoint from", ckpt_path)
    return net
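
Usage note: a few of the helpers above at a glance (self-contained, assuming scikit-learn is installed since the module imports PCA at load time):

import torch
from utils import symlog, minmaxnorm, calculate_iou

x = torch.tensor([-10.0, -1.0, 0.0, 1.0, 10.0])
print(symlog(x))      # sign(x) * log(|x| + 1): compresses magnitudes while keeping sign
print(minmaxnorm(x))  # linear rescale to [0, 1]

pred = torch.tensor([1, 1, 0, 0])
label = torch.tensor([1, 0, 0, 0])
print(calculate_iou(pred, label))  # intersection 1, union 2 -> 0.5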