hugoycj committed on
Commit 3d3e4e9
1 Parent(s): 7bf852f

Initial commit

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
1
+ __pycache__
app.py ADDED
@@ -0,0 +1,280 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from datetime import datetime
8
+ import os
9
+ import time
10
+ import torch
11
+ from typing import Dict, List, Optional, Union
12
+ from omegaconf import OmegaConf, DictConfig
13
+ import hydra
14
+ from hydra.utils import instantiate, get_original_cwd
15
+ import time
16
+ from functools import partial
17
+ import matplotlib.pyplot as plt
18
+ import shutil
19
+ from util.utils import seed_all_random_engines
20
+ from util.match_extraction import extract_match
21
+ from util.load_img_folder import load_and_preprocess_images
22
+ from util.geometry_guided_sampling import geometry_guided_sampling
23
+ from pytorch3d.vis.plotly_vis import get_camera_wireframe
24
+ import subprocess
25
+ import tempfile
26
+ import gradio as gr
27
+
28
+ def plot_cameras(ax, cameras, color: str = "blue"):
29
+ """
30
+ Plots a set of `cameras` objects into the matplotlib axis `ax` with
31
+ color `color`.
32
+ """
33
+ cam_wires_canonical = get_camera_wireframe().cuda()[None]
34
+ cam_trans = cameras.get_world_to_view_transform().inverse()
35
+ cam_wires_trans = cam_trans.transform_points(cam_wires_canonical)
36
+ plot_handles = []
37
+ for wire in cam_wires_trans:
38
+ # the Z and Y axes are flipped intentionally here!
39
+ x_, z_, y_ = wire.detach().cpu().numpy().T.astype(float)
40
+ (h,) = ax.plot(x_, y_, z_, color=color, linewidth=0.3)
41
+ plot_handles.append(h)
42
+ return plot_handles
43
+
44
+ def create_matplotlib_figure(pred_cameras):
45
+ fig = plt.figure()
46
+ ax = fig.add_subplot(projection="3d")
47
+ ax.clear()
48
+ handle_cam = plot_cameras(ax, pred_cameras, color="#FF7D1E")
49
+ plot_radius = 3
50
+ ax.set_xlim3d([-plot_radius, plot_radius])
51
+ ax.set_ylim3d([3 - plot_radius, 3 + plot_radius])
52
+ ax.set_zlim3d([-plot_radius, plot_radius])
53
+ ax.set_xlabel("x")
54
+ ax.set_ylabel("z")
55
+ ax.set_zlabel("y")
56
+ labels_handles = {
57
+ "Estimated cameras": handle_cam[0],
58
+ }
59
+ ax.legend(
60
+ labels_handles.values(),
61
+ labels_handles.keys(),
62
+ loc="upper center",
63
+ bbox_to_anchor=(0.5, 0),
64
+ )
65
+
66
+ return plt
67
+
68
+ import os
69
+ import json
70
+ import tempfile
71
+ from PIL import Image
72
+
73
+
74
+ def convert_extrinsics_pytorch3d_to_opengl(extrinsics: torch.Tensor) -> torch.Tensor:
75
+ """
76
+ Convert extrinsics from PyTorch3D coordinate system to OpenGL coordinate system.
77
+
78
+ Args:
79
+ extrinsics (torch.Tensor): a 4x4 extrinsic matrix in PyTorch3D coordinate system.
80
+
81
+ Returns:
82
+ torch.Tensor: a 4x4 extrinsic matrix in OpenGL coordinate system.
83
+ """
84
+ # Create a transformation matrix that flips the Z-axis
85
+ flip_z = torch.eye(4)
86
+ flip_z[2, 2] = -1
87
+ flip_z[0, 0] = -1
88
+
89
+ # Multiply the extrinsic matrix by the transformation matrix
90
+ extrinsics_opengl = torch.mm(extrinsics, flip_z)
91
+
92
+ return extrinsics_opengl
93
+
94
+ import json
95
+ from typing import List, Dict, Any
96
+
97
+ def create_camera_json(extrinsics: Any, focal_length_world: float, principle_points: List[float], image_size: int) -> str:
98
+ # Initialize the dictionary
99
+ camera_dict = {
100
+ "w": image_size,
101
+ "h": image_size,
102
+ "fl_x": float(focal_length_world[0]),
103
+ "fl_y": float(focal_length_world[1]),
104
+ "cx": float(principle_points[0]),
105
+ "cy": float(principle_points[1]),
106
+ "k1": 0.0, # Assuming these values are not provided
107
+ "k2": 0.0, # Assuming these values are not provided
108
+ "p1": 0.0, # Assuming these values are not provided
109
+ "p2": 0.0, # Assuming these values are not provided
110
+ "camera_model": "OPENCV",
111
+ "frames": []
112
+ }
113
+
114
+ # Add frames to the dictionary
115
+ for i, extrinsic in enumerate(extrinsics):
116
+ frame = {
117
+ "file_path": f"images/frame_{str(i).zfill(5)}.jpg",
118
+ "transform_matrix": extrinsic.tolist(),
119
+ "colmap_im_id": i
120
+ }
121
+ # Convert numpy float32 to Python's native float
122
+ frame["transform_matrix"] = [[float(element) for element in row] for row in frame["transform_matrix"]]
123
+ camera_dict["frames"].append(frame)
124
+
125
+ return camera_dict
126
+
127
+ def archieve_images_and_transforms(images, pred_cameras, image_size):
128
+ images_array = images.permute(0, 2, 3, 1).cpu().numpy() * 255
129
+ images_pil = [Image.fromarray(image.astype('uint8')) for image in images_array]
130
+
131
+ with tempfile.TemporaryDirectory() as temp_dir:
132
+ images_dir = os.path.join(temp_dir, 'images')
133
+ os.makedirs(images_dir, exist_ok=True)
134
+
135
+ images_path = []
136
+ for i, image in enumerate(images_pil):
137
+ image_path = os.path.join(images_dir, 'frame_{:05d}.jpg'.format(i))
138
+ image.save(image_path)
139
+ images_path.append(image_path)
140
+
141
+ cam_trans = pred_cameras.get_world_to_view_transform()
142
+ extrinsics = cam_trans.inverse().get_matrix().cpu()
143
+ extrinsics = [convert_extrinsics_pytorch3d_to_opengl(extrinsic.T) for extrinsic in extrinsics]
144
+
145
+ focal_length_ndc = pred_cameras.focal_length.mean(dim=0).cpu().numpy()
146
+ focal_length_world = focal_length_ndc * image_size / 2
147
+ principle_points = [image_size / 2, image_size / 2]
148
+ camera_dict = create_camera_json(extrinsics, focal_length_world, principle_points, image_size)
149
+
150
+ json_path = os.path.join(temp_dir, 'transforms.json')
151
+ with open(json_path, 'w') as f:
152
+ json.dump(camera_dict, f, indent=4)
153
+
154
+ project_name = datetime.now().strftime("%Y%m%d-%H%M%S")
155
+ shutil.make_archive(f'/tmp/{project_name}', 'zip', temp_dir)
156
+ return f'/tmp/{project_name}.zip'
157
+
158
+ def estimate_images_pose(image_folder, mode):
159
+ print("Selected mode:", mode)
160
+ with hydra.initialize(config_path="./cfgs/"):
161
+ cfg = hydra.compose(config_name=mode)
162
+
163
+ OmegaConf.set_struct(cfg, False)
164
+ print("Model Config:")
165
+ print(OmegaConf.to_yaml(cfg))
166
+
167
+ # Check for GPU availability and set the device
168
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
169
+
170
+ # Instantiate the model
171
+ model = instantiate(cfg.MODEL, _recursive_=False)
172
+
173
+ # Load and preprocess images
174
+ images, image_info = load_and_preprocess_images(image_folder, cfg.image_size)
175
+
176
+ # Load checkpoint
177
+ ckpt_path = os.path.join(cfg.ckpt)
178
+ if os.path.isfile(ckpt_path):
179
+ checkpoint = torch.load(ckpt_path, map_location=device)
180
+ model.load_state_dict(checkpoint, strict=True)
181
+ print(f"Loaded checkpoint from: {ckpt_path}")
182
+ else:
183
+ raise ValueError(f"No checkpoint found at: {ckpt_path}")
184
+
185
+ # Move model and images to the GPU
186
+ model = model.to(device)
187
+ images = images.to(device)
188
+
189
+ # Evaluation Mode
190
+ model.eval()
191
+
192
+ # Seed random engines
193
+ seed_all_random_engines(cfg.seed)
194
+
195
+ # Start the timer
196
+ start_time = time.time()
197
+
198
+ # Perform match extraction
199
+ if cfg.GGS.enable:
200
+ # Optional TODO: remove the keypoints outside the cropped region?
201
+
202
+ kp1, kp2, i12 = extract_match(image_folder, image_info)
203
+
204
+ keys = ["kp1", "kp2", "i12", "img_shape"]
205
+ values = [kp1, kp2, i12, images.shape]
206
+ matches_dict = dict(zip(keys, values))
207
+
208
+ cfg.GGS.pose_encoding_type = cfg.MODEL.pose_encoding_type
209
+ GGS_cfg = OmegaConf.to_container(cfg.GGS)
210
+
211
+ cond_fn = partial(
212
+ geometry_guided_sampling, matches_dict=matches_dict, GGS_cfg=GGS_cfg
213
+ )
214
+ print("=====> Sampling with GGS <=====")
215
+ else:
216
+ cond_fn = None
217
+ print("=====> Sampling without GGS <=====")
218
+
219
+ # Forward
220
+ with torch.no_grad():
221
+ # Obtain predicted camera parameters
222
+ # pred_cameras is a PerspectiveCameras object with attributes
223
+ # pred_cameras.R, pred_cameras.T, pred_cameras.focal_length
224
+
225
+ # The poses and focal length are defined as
226
+ # NDC coordinate system in
227
+ # https://github.com/facebookresearch/pytorch3d/blob/main/docs/notes/cameras.md
228
+ pred_cameras = model(
229
+ image=images, cond_fn=cond_fn, cond_start_step=cfg.GGS.start_step
230
+ )
231
+
232
+ # Stop the timer and calculate elapsed time
233
+ end_time = time.time()
234
+ elapsed_time = end_time - start_time
235
+ print("Time taken: {:.4f} seconds".format(elapsed_time))
236
+
237
+ zip_path = archieve_images_and_transforms(images, pred_cameras, cfg.image_size)
238
+ return create_matplotlib_figure(pred_cameras), zip_path
239
+
240
+ def extract_frames_from_video(video_path: str) -> str:
241
+ """
242
+ Extracts frames from a video file and saves them in a temporary directory.
243
+ Returns the path to the directory containing the frames.
244
+ """
245
+ temp_dir = tempfile.mkdtemp()
246
+ output_path = os.path.join(temp_dir, "%03d.jpg")
247
+ command = [
248
+ "ffmpeg",
249
+ "-i", video_path,
250
+ "-vf", "fps=1",
251
+ output_path
252
+ ]
253
+ subprocess.run(command, check=True)
254
+ return temp_dir
255
+
256
+ def estimate_video_pose(video_path: str, mode: str):
257
+ """
258
+ Estimates camera poses for the frames of a video; returns the matplotlib figure and the path of the exported archive.
259
+ """
260
+ # Extract frames from the video
261
+ image_folder = extract_frames_from_video(video_path)
262
+ # Estimate the pose for each frame and package the results
263
+ fig, zip_path = estimate_images_pose(image_folder, mode)
264
+ return fig, zip_path
265
+
266
+ if __name__ == "__main__":
267
+ examples = [["examples/" + img, 'fast'] for img in os.listdir("examples/")]
268
+ # Create a Gradio interface
269
+ iface = gr.Interface(
270
+ fn=estimate_video_pose,
271
+ inputs=[gr.inputs.Video(label='video', type='mp4'),
272
+ gr.inputs.Radio(choices=['fast', 'precise'], default='fast',
273
+ label='Estimation model: fast is quick, usually within 1 second; precise has higher accuracy but usually takes several minutes')],
274
+ outputs=['plot', 'file'],
275
+ title="PoseDiffusion Demo: Solving Pose Estimation via Diffusion-aided Bundle Adjustment",
276
+ description="Upload a video for object pose estimation. The object should be centrally located within the frame.",
277
+ examples=examples,
278
+ cache_examples=True
279
+ )
280
+ iface.launch()
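
A minimal way to exercise this pipeline without the Gradio UI, shown as a hedged sketch rather than part of the commit; it assumes ffmpeg, a CUDA device, and the checkpoint referenced in cfgs/fast.yaml are available, and uses the example video shipped above:

from app import estimate_video_pose

# run the fast (GGS-disabled) profile on the bundled example clip
fig, zip_path = estimate_video_pose(
    "examples/71165193657__AED15223-1435-44B6-AFC1-884527CE1642.mp4",
    mode="fast",
)
fig.savefig("/tmp/estimated_cameras.png")          # camera wireframe plot
print("NeRF-style transforms archive:", zip_path)  # images/ + transforms.json zip
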
cfgs/default.yaml ADDED
@@ -0,0 +1,40 @@
1
+ image_folder: samples/apple
2
+ image_size: 224
3
+ ckpt: weights/co3d_model_Apr16.pth
4
+ seed: 0
5
+
6
+ GGS:
7
+ enable: True
8
+ start_step: 10
9
+ learning_rate: 0.01
10
+ iter_num: 100
11
+ sampson_max: 10
12
+ min_matches: 10
13
+ alpha: 0.0001
14
+
15
+
16
+ MODEL:
17
+ _target_: models.PoseDiffusionModel
18
+
19
+ pose_encoding_type: absT_quaR_logFL
20
+
21
+ IMAGE_FEATURE_EXTRACTOR:
22
+ _target_: models.MultiScaleImageFeatureExtractor
23
+ freeze: False
24
+
25
+ DENOISER:
26
+ _target_: models.Denoiser
27
+ TRANSFORMER:
28
+ _target_: models.TransformerEncoderWrapper
29
+ d_model: 512
30
+ nhead: 4
31
+ dim_feedforward: 1024
32
+ num_encoder_layers: 8
33
+ dropout: 0.1
34
+ batch_first: True
35
+ norm_first: True
36
+
37
+
38
+ DIFFUSER:
39
+ _target_: models.GaussianDiffusion
40
+ beta_schedule: custom
cfgs/fast.yaml ADDED
@@ -0,0 +1,40 @@
1
+ image_folder: samples/apple
2
+ image_size: 224
3
+ ckpt: weights/co3d_model_Apr16.pth
4
+ seed: 0
5
+
6
+ GGS:
7
+ enable: False
8
+ start_step: 10
9
+ learning_rate: 0.01
10
+ iter_num: 100
11
+ sampson_max: 10
12
+ min_matches: 10
13
+ alpha: 0.0001
14
+
15
+
16
+ MODEL:
17
+ _target_: models.PoseDiffusionModel
18
+
19
+ pose_encoding_type: absT_quaR_logFL
20
+
21
+ IMAGE_FEATURE_EXTRACTOR:
22
+ _target_: models.MultiScaleImageFeatureExtractor
23
+ freeze: False
24
+
25
+ DENOISER:
26
+ _target_: models.Denoiser
27
+ TRANSFORMER:
28
+ _target_: models.TransformerEncoderWrapper
29
+ d_model: 512
30
+ nhead: 4
31
+ dim_feedforward: 1024
32
+ num_encoder_layers: 8
33
+ dropout: 0.1
34
+ batch_first: True
35
+ norm_first: True
36
+
37
+
38
+ DIFFUSER:
39
+ _target_: models.GaussianDiffusion
40
+ beta_schedule: custom
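
A small sketch (not in the commit) of how app.py composes these profiles with Hydra; it assumes the script is run from the repo root so that ./cfgs/ resolves:

import hydra
from omegaconf import OmegaConf

with hydra.initialize(config_path="./cfgs/"):
    fast_cfg = hydra.compose(config_name="fast")
    default_cfg = hydra.compose(config_name="default")

print(fast_cfg.GGS.enable, default_cfg.GGS.enable)   # False vs True: the only difference between the profiles
print(OmegaConf.to_yaml(fast_cfg.MODEL.DENOISER))    # transformer settings instantiated by Hydra
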
examples/71165193657__AED15223-1435-44B6-AFC1-884527CE1642.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:691ae5feb1286b531ad974a7e3a5859bb6de7e26b3e4f21eb208afc4af8038e7
3
+ size 512683
models/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from .pose_diffusion_model import PoseDiffusionModel
8
+
9
+
10
+ from .denoiser import Denoiser, TransformerEncoderWrapper
11
+ from .gaussian_diffuser import GaussianDiffusion
12
+ from .image_feature_extractor import MultiScaleImageFeatureExtractor
models/denoiser.py ADDED
@@ -0,0 +1,179 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import logging
8
+ from collections import defaultdict
9
+ from dataclasses import field, dataclass
10
+ from typing import Any, Dict, List, Optional, Tuple, Union, Callable
11
+ from util.embedding import TimeStepEmbedding, PoseEmbedding
12
+
13
+ import torch
14
+ import torch.nn as nn
15
+
16
+ from hydra.utils import instantiate
17
+
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ class Denoiser(nn.Module):
23
+ def __init__(
24
+ self,
25
+ TRANSFORMER: Dict,
26
+ target_dim: int = 9, # TODO: reduce fl dim from 2 to 1
27
+ pivot_cam_onehot: bool = True,
28
+ z_dim: int = 384,
29
+ mlp_hidden_dim: bool = 128,
30
+ ):
31
+ super().__init__()
32
+
33
+ self.pivot_cam_onehot = pivot_cam_onehot
34
+ self.target_dim = target_dim
35
+
36
+ self.time_embed = TimeStepEmbedding()
37
+ self.pose_embed = PoseEmbedding(target_dim=self.target_dim)
38
+
39
+ first_dim = (
40
+ self.time_embed.out_dim
41
+ + self.pose_embed.out_dim
42
+ + z_dim
43
+ + int(self.pivot_cam_onehot)
44
+ )
45
+
46
+ d_model = TRANSFORMER.d_model
47
+ self._first = nn.Linear(first_dim, d_model)
48
+
49
+ # slightly different from the paper that
50
+ # we use 2 encoder layers and 6 decoder layers
51
+ # here we use a transformer with 8 encoder layers
52
+ # call TransformerEncoderWrapper() to build a encoder-only transformer
53
+ self._trunk = instantiate(TRANSFORMER, _recursive_=False)
54
+
55
+ # TODO: change the implementation of MLP to a more mature one
56
+ self._last = MLP(
57
+ d_model,
58
+ [mlp_hidden_dim, self.target_dim],
59
+ norm_layer=nn.LayerNorm,
60
+ )
61
+
62
+ def forward(
63
+ self,
64
+ x: torch.Tensor, # B x N x dim
65
+ t: torch.Tensor, # B
66
+ z: torch.Tensor, # B x N x dim_z
67
+ ):
68
+ B, N, _ = x.shape
69
+
70
+ t_emb = self.time_embed(t)
71
+ # expand t from B x C to B x N x C
72
+ t_emb = t_emb.view(B, 1, t_emb.shape[-1]).expand(-1, N, -1)
73
+
74
+ x_emb = self.pose_embed(x)
75
+
76
+ if self.pivot_cam_onehot:
77
+ # add the one hot vector identifying the first camera as pivot
78
+ cam_pivot_id = torch.zeros_like(z[..., :1])
79
+ cam_pivot_id[:, 0, ...] = 1.0
80
+ z = torch.cat([z, cam_pivot_id], dim=-1)
81
+
82
+ feed_feats = torch.cat([x_emb, t_emb, z], dim=-1)
83
+
84
+ input_ = self._first(feed_feats)
85
+
86
+ feats_ = self._trunk(input_)
87
+
88
+ output = self._last(feats_)
89
+
90
+ return output
91
+
92
+
93
+ def TransformerEncoderWrapper(
94
+ d_model: int,
95
+ nhead: int,
96
+ num_encoder_layers: int,
97
+ dim_feedforward: int = 2048,
98
+ dropout: float = 0.1,
99
+ norm_first: bool = True,
100
+ batch_first: bool = True,
101
+ ):
102
+ encoder_layer = torch.nn.TransformerEncoderLayer(
103
+ d_model=d_model,
104
+ nhead=nhead,
105
+ dim_feedforward=dim_feedforward,
106
+ dropout=dropout,
107
+ batch_first=batch_first,
108
+ norm_first=norm_first,
109
+ )
110
+
111
+ _trunk = torch.nn.TransformerEncoder(encoder_layer, num_encoder_layers)
112
+ return _trunk
113
+
114
+
115
+ class MLP(torch.nn.Sequential):
116
+ """This block implements the multi-layer perceptron (MLP) module.
117
+
118
+ Args:
119
+ in_channels (int): Number of channels of the input
120
+ hidden_channels (List[int]): List of the hidden channel dimensions
121
+ norm_layer (Callable[..., torch.nn.Module], optional):
122
+ Norm layer that will be stacked on top of the convolution layer.
123
+ If ``None`` this layer wont be used. Default: ``None``
124
+ activation_layer (Callable[..., torch.nn.Module], optional):
125
+ Activation function which will be stacked on top of the
126
+ normalization layer (if not None), otherwise on top of the
127
+ conv layer. If ``None`` this layer wont be used.
128
+ Default: ``torch.nn.ReLU``
129
+ inplace (bool): Parameter for the activation layer, which can
130
+ optionally do the operation in-place. Default ``True``
131
+ bias (bool): Whether to use bias in the linear layer. Default ``True``
132
+ dropout (float): The probability for the dropout layer. Default: 0.0
133
+ """
134
+
135
+ def __init__(
136
+ self,
137
+ in_channels: int,
138
+ hidden_channels: List[int],
139
+ norm_layer: Optional[Callable[..., torch.nn.Module]] = None,
140
+ activation_layer: Optional[
141
+ Callable[..., torch.nn.Module]
142
+ ] = torch.nn.ReLU,
143
+ inplace: Optional[bool] = True,
144
+ bias: bool = True,
145
+ norm_first: bool = False,
146
+ dropout: float = 0.0,
147
+ ):
148
+ # The addition of `norm_layer` is inspired from
149
+ # the implementation of TorchMultimodal:
150
+ # https://github.com/facebookresearch/multimodal/blob/5dec8a/torchmultimodal/modules/layers/mlp.py
151
+ params = {} if inplace is None else {"inplace": inplace}
152
+
153
+ layers = []
154
+ in_dim = in_channels
155
+
156
+ for hidden_dim in hidden_channels[:-1]:
157
+ if norm_first and norm_layer is not None:
158
+ layers.append(norm_layer(in_dim))
159
+
160
+ layers.append(torch.nn.Linear(in_dim, hidden_dim, bias=bias))
161
+
162
+ if not norm_first and norm_layer is not None:
163
+ layers.append(norm_layer(hidden_dim))
164
+
165
+ layers.append(activation_layer(**params))
166
+
167
+ if dropout > 0:
168
+ layers.append(torch.nn.Dropout(dropout, **params))
169
+
170
+ in_dim = hidden_dim
171
+
172
+ if norm_first and norm_layer is not None:
173
+ layers.append(norm_layer(in_dim))
174
+
175
+ layers.append(torch.nn.Linear(in_dim, hidden_channels[-1], bias=bias))
176
+ if dropout > 0:
177
+ layers.append(torch.nn.Dropout(dropout, **params))
178
+
179
+ super().__init__(*layers)
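
An illustrative sketch, not part of the repo, that drives Denoiser on random tensors with the same transformer settings as cfgs/default.yaml; the tensor sizes below are arbitrary stand-ins:

import torch
from omegaconf import OmegaConf
from models.denoiser import Denoiser

transformer_cfg = OmegaConf.create({
    "_target_": "models.TransformerEncoderWrapper",
    "d_model": 512, "nhead": 4, "dim_feedforward": 1024,
    "num_encoder_layers": 8, "dropout": 0.1,
    "batch_first": True, "norm_first": True,
})
denoiser = Denoiser(TRANSFORMER=transformer_cfg, target_dim=9, z_dim=384)

B, N = 1, 8                       # one sequence of 8 frames
x = torch.randn(B, N, 9)          # noisy pose encodings (absT_quaR_logFL)
t = torch.randint(0, 100, (B,))   # diffusion timesteps
z = torch.randn(B, N, 384)        # per-frame image features
print(denoiser(x, t, z).shape)    # torch.Size([1, 8, 9])
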
models/gaussian_diffuser.py ADDED
@@ -0,0 +1,410 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Modified from https://github.com/lucidrains/denoising-diffusion-pytorch/blob/beb2f2d8dd9b4f2bd5be4719f37082fe061ee450/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
8
+
9
+ import math
10
+ import copy
11
+ from pathlib import Path
12
+ from random import random
13
+ from functools import partial
14
+ from collections import namedtuple
15
+ from multiprocessing import cpu_count
16
+
17
+ import torch
18
+ from torch import nn, einsum
19
+ import torch.nn.functional as F
20
+ from torch.utils.data import Dataset, DataLoader
21
+
22
+ from torch.optim import Adam
23
+ from torchvision import transforms as T, utils
24
+
25
+ from einops import rearrange, reduce
26
+ from einops.layers.torch import Rearrange
27
+
28
+ from PIL import Image
29
+ from tqdm.auto import tqdm
30
+ from typing import Any, Dict, List, Optional, Tuple, Union
31
+
32
+ # constants
33
+
34
+ ModelPrediction = namedtuple("ModelPrediction", ["pred_noise", "pred_x_start"])
35
+
36
+ # helper functions
37
+
38
+
39
+ def exists(x):
40
+ return x is not None
41
+
42
+
43
+ def default(val, d):
44
+ if exists(val):
45
+ return val
46
+ return d() if callable(d) else d
47
+
48
+
49
+ def extract(a, t, x_shape):
50
+ b, *_ = t.shape
51
+ out = a.gather(-1, t)
52
+ return out.reshape(b, *((1,) * (len(x_shape) - 1)))
53
+
54
+
55
+ def linear_beta_schedule(timesteps):
56
+ scale = 1000 / timesteps
57
+ beta_start = scale * 0.0001
58
+ beta_end = scale * 0.02
59
+ return torch.linspace(beta_start, beta_end, timesteps, dtype=torch.float64)
60
+
61
+
62
+ def cosine_beta_schedule(timesteps, s=0.008):
63
+ """
64
+ cosine schedule
65
+ as proposed in https://openreview.net/forum?id=-NEXDKk8gZ
66
+ """
67
+ steps = timesteps + 1
68
+ x = torch.linspace(0, timesteps, steps, dtype=torch.float64)
69
+ alphas_cumprod = (
70
+ torch.cos(((x / timesteps) + s) / (1 + s) * math.pi * 0.5) ** 2
71
+ )
72
+ alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
73
+ betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
74
+ return torch.clip(betas, 0, 0.999)
75
+
76
+
77
+ class GaussianDiffusion(nn.Module):
78
+ def __init__(
79
+ self,
80
+ timesteps=100,
81
+ sampling_timesteps=None,
82
+ beta_1=0.0001,
83
+ beta_T=0.1,
84
+ loss_type="l1",
85
+ objective="pred_noise",
86
+ beta_schedule="custom",
87
+ p2_loss_weight_gamma=0.0,
88
+ p2_loss_weight_k=1,
89
+ ):
90
+ super().__init__()
91
+
92
+ self.objective = objective
93
+
94
+ assert objective in {
95
+ "pred_noise",
96
+ "pred_x0",
97
+ }, "objective must be either pred_noise (predict noise) \
98
+ or pred_x0 (predict image start)"
99
+
100
+ self.timesteps = timesteps
101
+ self.sampling_timesteps = sampling_timesteps
102
+ self.beta_1 = beta_1
103
+ self.beta_T = beta_T
104
+ self.loss_type = loss_type
105
+ self.objective = objective
106
+ self.beta_schedule = beta_schedule
107
+ self.p2_loss_weight_gamma = p2_loss_weight_gamma
108
+ self.p2_loss_weight_k = p2_loss_weight_k
109
+
110
+ self.init_diff_hyper(
111
+ self.timesteps,
112
+ self.sampling_timesteps,
113
+ self.beta_1,
114
+ self.beta_T,
115
+ self.loss_type,
116
+ self.objective,
117
+ self.beta_schedule,
118
+ self.p2_loss_weight_gamma,
119
+ self.p2_loss_weight_k,
120
+ )
121
+
122
+ def init_diff_hyper(
123
+ self,
124
+ timesteps,
125
+ sampling_timesteps,
126
+ beta_1,
127
+ beta_T,
128
+ loss_type,
129
+ objective,
130
+ beta_schedule,
131
+ p2_loss_weight_gamma,
132
+ p2_loss_weight_k,
133
+ ):
134
+ if beta_schedule == "linear":
135
+ betas = linear_beta_schedule(timesteps)
136
+ elif beta_schedule == "cosine":
137
+ betas = cosine_beta_schedule(timesteps)
138
+ elif beta_schedule == "custom":
139
+ betas = torch.linspace(
140
+ beta_1, beta_T, timesteps, dtype=torch.float64
141
+ )
142
+ else:
143
+ raise ValueError(f"unknown beta schedule {beta_schedule}")
144
+
145
+ alphas = 1.0 - betas
146
+ alphas_cumprod = torch.cumprod(alphas, axis=0)
147
+ alphas_cumprod_prev = F.pad(alphas_cumprod[:-1], (1, 0), value=1.0)
148
+
149
+ (timesteps,) = betas.shape
150
+ self.num_timesteps = int(timesteps)
151
+ self.loss_type = loss_type
152
+
153
+ # sampling related parameters
154
+ self.sampling_timesteps = default(
155
+ sampling_timesteps, timesteps
156
+ ) # default num sampling timesteps to number of timesteps at training
157
+
158
+ assert self.sampling_timesteps <= timesteps
159
+
160
+ # helper function to register buffer from float64 to float32
161
+ register_buffer = lambda name, val: self.register_buffer(
162
+ name, val.to(torch.float32)
163
+ )
164
+
165
+ register_buffer("betas", betas)
166
+ register_buffer("alphas_cumprod", alphas_cumprod)
167
+ register_buffer("alphas_cumprod_prev", alphas_cumprod_prev)
168
+
169
+ # calculations for diffusion q(x_t | x_{t-1}) and others
170
+ register_buffer("sqrt_alphas_cumprod", torch.sqrt(alphas_cumprod))
171
+ register_buffer(
172
+ "sqrt_one_minus_alphas_cumprod", torch.sqrt(1.0 - alphas_cumprod)
173
+ )
174
+ register_buffer(
175
+ "log_one_minus_alphas_cumprod", torch.log(1.0 - alphas_cumprod)
176
+ )
177
+ register_buffer(
178
+ "sqrt_recip_alphas_cumprod", torch.sqrt(1.0 / alphas_cumprod)
179
+ )
180
+ register_buffer(
181
+ "sqrt_recipm1_alphas_cumprod", torch.sqrt(1.0 / alphas_cumprod - 1)
182
+ )
183
+
184
+ # calculations for posterior q(x_{t-1} | x_t, x_0)
185
+ posterior_variance = (
186
+ betas * (1.0 - alphas_cumprod_prev) / (1.0 - alphas_cumprod)
187
+ )
188
+
189
+ # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
190
+ register_buffer("posterior_variance", posterior_variance)
191
+
192
+ # below: log calculation clipped because the posterior variance is 0
193
+ # at the beginning of the diffusion chain
194
+ register_buffer(
195
+ "posterior_log_variance_clipped",
196
+ torch.log(posterior_variance.clamp(min=1e-20)),
197
+ )
198
+ register_buffer(
199
+ "posterior_mean_coef1",
200
+ betas * torch.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod),
201
+ )
202
+ register_buffer(
203
+ "posterior_mean_coef2",
204
+ (1.0 - alphas_cumprod_prev)
205
+ * torch.sqrt(alphas)
206
+ / (1.0 - alphas_cumprod),
207
+ )
208
+
209
+ # calculate p2 reweighting
210
+ register_buffer(
211
+ "p2_loss_weight",
212
+ (p2_loss_weight_k + alphas_cumprod / (1 - alphas_cumprod))
213
+ ** -p2_loss_weight_gamma,
214
+ )
215
+
216
+ # helper functions
217
+ def predict_start_from_noise(self, x_t, t, noise):
218
+ return (
219
+ extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
220
+ - extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
221
+ )
222
+
223
+ def predict_noise_from_start(self, x_t, t, x0):
224
+ return (
225
+ extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - x0
226
+ ) / extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
227
+
228
+ def q_posterior(self, x_start, x_t, t):
229
+ posterior_mean = (
230
+ extract(self.posterior_mean_coef1, t, x_t.shape) * x_start
231
+ + extract(self.posterior_mean_coef2, t, x_t.shape) * x_t
232
+ )
233
+
234
+ posterior_variance = extract(self.posterior_variance, t, x_t.shape)
235
+ posterior_log_variance_clipped = extract(
236
+ self.posterior_log_variance_clipped, t, x_t.shape
237
+ )
238
+ return (
239
+ posterior_mean,
240
+ posterior_variance,
241
+ posterior_log_variance_clipped,
242
+ )
243
+
244
+ def q_sample(self, x_start, t, noise=None):
245
+ noise = default(noise, lambda: torch.randn_like(x_start))
246
+ return (
247
+ extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
248
+ + extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape)
249
+ * noise
250
+ )
251
+
252
+ def model_predictions(self, x, t, z, x_self_cond=None):
253
+ model_output = self.model(x, t, z)
254
+
255
+ if self.objective == "pred_noise":
256
+ pred_noise = model_output
257
+ x_start = self.predict_start_from_noise(x, t, model_output)
258
+
259
+ elif self.objective == "pred_x0":
260
+ pred_noise = self.predict_noise_from_start(x, t, model_output)
261
+ x_start = model_output
262
+
263
+ return ModelPrediction(pred_noise, x_start)
264
+
265
+ def p_mean_variance(
266
+ self,
267
+ x: torch.Tensor, # B x N_x x dim
268
+ t: int,
269
+ z: torch.Tensor,
270
+ x_self_cond=None,
271
+ clip_denoised=False,
272
+ ):
273
+ preds = self.model_predictions(x, t, z)
274
+
275
+ x_start = preds.pred_x_start
276
+
277
+ if clip_denoised:
278
+ raise NotImplementedError(
279
+ "We don't clip the output because \
280
+ pose does not have a clear bound."
281
+ )
282
+
283
+ (
284
+ model_mean,
285
+ posterior_variance,
286
+ posterior_log_variance,
287
+ ) = self.q_posterior(x_start=x_start, x_t=x, t=t)
288
+
289
+ return model_mean, posterior_variance, posterior_log_variance, x_start
290
+
291
+ @torch.no_grad()
292
+ def p_sample(
293
+ self,
294
+ x: torch.Tensor, # B x N_x x dim
295
+ t: int,
296
+ z: torch.Tensor,
297
+ x_self_cond=None,
298
+ clip_denoised=False,
299
+ cond_fn=None,
300
+ cond_start_step=0,
301
+ ):
302
+ b, *_, device = *x.shape, x.device
303
+ batched_times = torch.full(
304
+ (x.shape[0],), t, device=x.device, dtype=torch.long
305
+ )
306
+ model_mean, _, model_log_variance, x_start = self.p_mean_variance(
307
+ x=x,
308
+ t=batched_times,
309
+ z=z,
310
+ x_self_cond=x_self_cond,
311
+ clip_denoised=clip_denoised,
312
+ )
313
+
314
+ if cond_fn is not None and t < cond_start_step:
315
+ model_mean = cond_fn(model_mean, t)
316
+ noise = 0.0
317
+ else:
318
+ noise = torch.randn_like(x) if t > 0 else 0.0 # no noise if t == 0
319
+
320
+ pred = model_mean + (0.5 * model_log_variance).exp() * noise
321
+ return pred, x_start
322
+
323
+ @torch.no_grad()
324
+ def p_sample_loop(
325
+ self,
326
+ shape,
327
+ z: torch.Tensor,
328
+ cond_fn=None,
329
+ cond_start_step=0,
330
+ ):
331
+ batch, device = shape[0], self.betas.device
332
+
333
+ # Init here
334
+ pose = torch.randn(shape, device=device)
335
+
336
+ x_start = None
337
+
338
+ pose_process = []
339
+ pose_process.append(pose.unsqueeze(0))
340
+
341
+ for t in reversed(range(0, self.num_timesteps)):
342
+ pose, _ = self.p_sample(
343
+ x=pose,
344
+ t=t,
345
+ z=z,
346
+ cond_fn=cond_fn,
347
+ cond_start_step=cond_start_step,
348
+ )
349
+ pose_process.append(pose.unsqueeze(0))
350
+
351
+ return pose, torch.cat(pose_process)
352
+
353
+ @torch.no_grad()
354
+ def sample(self, shape, z, cond_fn=None, cond_start_step=0):
355
+ # TODO: add more variants
356
+ sample_fn = self.p_sample_loop
357
+ return sample_fn(
358
+ shape, z=z, cond_fn=cond_fn, cond_start_step=cond_start_step
359
+ )
360
+
361
+ def p_losses(
362
+ self,
363
+ x_start,
364
+ t,
365
+ z=None,
366
+ noise=None,
367
+ ):
368
+ noise = default(noise, lambda: torch.randn_like(x_start))
369
+ # noise sample
370
+ x = self.q_sample(x_start=x_start, t=t, noise=noise)
371
+
372
+ model_out = self.model(x, t, z)
373
+
374
+ if self.objective == "pred_noise":
375
+ target = noise
376
+ x_0_pred = self.predict_start_from_noise(x, t, model_out)
377
+ elif self.objective == "pred_x0":
378
+ target = x_start
379
+ x_0_pred = model_out
380
+ else:
381
+ raise ValueError(f"unknown objective {self.objective}")
382
+
383
+ loss = self.loss_fn(model_out, target, reduction="none")
384
+
385
+ loss = reduce(loss, "b ... -> b (...)", "mean")
386
+ loss = loss * extract(self.p2_loss_weight, t, loss.shape)
387
+
388
+ return {
389
+ "loss": loss,
390
+ "noise": noise,
391
+ "x_0_pred": x_0_pred,
392
+ "x_t": x,
393
+ "t": t,
394
+ }
395
+
396
+ def forward(self, pose, z=None, *args, **kwargs):
397
+ b = len(pose)
398
+ t = torch.randint(
399
+ 0, self.num_timesteps, (b,), device=pose.device
400
+ ).long()
401
+ return self.p_losses(pose, t, z=z, *args, **kwargs)
402
+
403
+ @property
404
+ def loss_fn(self):
405
+ if self.loss_type == "l1":
406
+ return F.l1_loss
407
+ elif self.loss_type == "l2":
408
+ return F.mse_loss
409
+ else:
410
+ raise ValueError(f"invalid loss type {self.loss_type}")
models/image_feature_extractor.py ADDED
@@ -0,0 +1,108 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import logging
8
+ import math
9
+ import warnings
10
+ from collections import defaultdict
11
+ from dataclasses import field, dataclass
12
+ from typing import Any, Dict, List, Optional, Tuple, Union, Callable
13
+
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torchvision
18
+
19
+ import io
20
+ from PIL import Image
21
+ import numpy as np
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ _RESNET_MEAN = [0.485, 0.456, 0.406]
26
+ _RESNET_STD = [0.229, 0.224, 0.225]
27
+
28
+
29
+ class MultiScaleImageFeatureExtractor(nn.Module):
30
+ def __init__(
31
+ self,
32
+ modelname: str = "dino_vits16",
33
+ freeze: bool = False,
34
+ scale_factors: list = [1, 1 / 2, 1 / 3],
35
+ ):
36
+ super().__init__()
37
+ self.freeze = freeze
38
+ self.scale_factors = scale_factors
39
+
40
+ if "res" in modelname:
41
+ self._net = getattr(torchvision.models, modelname)(pretrained=True)
42
+ self._output_dim = self._net.fc.weight.shape[1]
43
+ self._net.fc = nn.Identity()
44
+ elif "dino" in modelname:
45
+ self._net = torch.hub.load("facebookresearch/dino:main", modelname)
46
+ self._output_dim = self._net.norm.weight.shape[0]
47
+ else:
48
+ raise ValueError(f"Unknown model name {modelname}")
49
+
50
+ for name, value in (
51
+ ("_resnet_mean", _RESNET_MEAN),
52
+ ("_resnet_std", _RESNET_STD),
53
+ ):
54
+ self.register_buffer(
55
+ name,
56
+ torch.FloatTensor(value).view(1, 3, 1, 1),
57
+ persistent=False,
58
+ )
59
+
60
+ if self.freeze:
61
+ for param in self.parameters():
62
+ param.requires_grad = False
63
+
64
+ def get_output_dim(self):
65
+ return self._output_dim
66
+
67
+ def forward(self, image_rgb: torch.Tensor) -> torch.Tensor:
68
+ img_normed = self._resnet_normalize_image(image_rgb)
69
+
70
+ features = self._compute_multiscale_features(img_normed)
71
+
72
+ return features
73
+
74
+ def _resnet_normalize_image(self, img: torch.Tensor) -> torch.Tensor:
75
+ return (img - self._resnet_mean) / self._resnet_std
76
+
77
+ def _compute_multiscale_features(
78
+ self, img_normed: torch.Tensor
79
+ ) -> torch.Tensor:
80
+ multiscale_features = None
81
+
82
+ if len(self.scale_factors) <= 0:
83
+ raise ValueError(
84
+ f"Wrong format of self.scale_factors: {self.scale_factors}"
85
+ )
86
+
87
+ for scale_factor in self.scale_factors:
88
+ if scale_factor == 1:
89
+ inp = img_normed
90
+ else:
91
+ inp = self._resize_image(img_normed, scale_factor)
92
+
93
+ if multiscale_features is None:
94
+ multiscale_features = self._net(inp)
95
+ else:
96
+ multiscale_features += self._net(inp)
97
+
98
+ averaged_features = multiscale_features / len(self.scale_factors)
99
+ return averaged_features
100
+
101
+ @staticmethod
102
+ def _resize_image(image: torch.Tensor, scale_factor: float) -> torch.Tensor:
103
+ return nn.functional.interpolate(
104
+ image,
105
+ scale_factor=scale_factor,
106
+ mode="bilinear",
107
+ align_corners=False,
108
+ )
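
A quick sketch (not in the commit): the extractor averages DINO ViT-S/16 features over the three scale factors above, giving one 384-d vector per frame; it downloads the DINO weights from torch.hub on first use:

import torch
from models.image_feature_extractor import MultiScaleImageFeatureExtractor

extractor = MultiScaleImageFeatureExtractor(modelname="dino_vits16", freeze=True).eval()
frames = torch.rand(4, 3, 224, 224)              # N x 3 x H x W images in [0, 1]
with torch.no_grad():
    feats = extractor(frames)
print(feats.shape, extractor.get_output_dim())   # torch.Size([4, 384]) 384
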
models/pose_diffusion_model.py ADDED
@@ -0,0 +1,126 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Standard library imports
8
+ import base64
9
+ import io
10
+ import logging
11
+ import math
12
+ import pickle
13
+ import warnings
14
+ from collections import defaultdict
15
+ from dataclasses import field, dataclass
16
+ from typing import Any, Dict, List, Optional, Tuple, Union
17
+
18
+ # Third-party library imports
19
+ import numpy as np
20
+ import torch
21
+ import torch.nn as nn
22
+ from PIL import Image
23
+
24
+ from pytorch3d.renderer.cameras import CamerasBase
25
+ from pytorch3d.transforms import (
26
+ se3_exp_map,
27
+ se3_log_map,
28
+ Transform3d,
29
+ so3_relative_angle,
30
+ )
31
+ from util.camera_transform import pose_encoding_to_camera
32
+
33
+ import models
34
+ from hydra.utils import instantiate
35
+ from pytorch3d.renderer.cameras import PerspectiveCameras
36
+
37
+
38
+ logger = logging.getLogger(__name__)
39
+
40
+
41
+ class PoseDiffusionModel(nn.Module):
42
+ def __init__(
43
+ self,
44
+ pose_encoding_type: str,
45
+ IMAGE_FEATURE_EXTRACTOR: Dict,
46
+ DIFFUSER: Dict,
47
+ DENOISER: Dict,
48
+ ):
49
+ """Initializes a PoseDiffusion model.
50
+
51
+ Args:
52
+ pose_encoding_type (str):
53
+ Defines the encoding type for extrinsics and intrinsics
54
+ Currently, only `"absT_quaR_logFL"` is supported -
55
+ a concatenation of the translation vector,
56
+ rotation quaternion, and logarithm of focal length.
57
+ IMAGE_FEATURE_EXTRACTOR (Dict):
58
+ Configuration for the image feature extractor.
59
+ DIFFUSER (Dict):
60
+ Configuration for the diffuser.
61
+ DENOISER (Dict):
62
+ Configuration for the denoiser.
63
+ """
64
+
65
+ super().__init__()
66
+
67
+ self.pose_encoding_type = pose_encoding_type
68
+
69
+ self.image_feature_extractor = instantiate(
70
+ IMAGE_FEATURE_EXTRACTOR, _recursive_=False
71
+ )
72
+ self.diffuser = instantiate(DIFFUSER, _recursive_=False)
73
+
74
+ denoiser = instantiate(DENOISER, _recursive_=False)
75
+ self.diffuser.model = denoiser
76
+
77
+ self.target_dim = denoiser.target_dim
78
+
79
+ def forward(
80
+ self,
81
+ image: torch.Tensor,
82
+ gt_cameras: Optional[CamerasBase] = None,
83
+ sequence_name: Optional[List[str]] = None,
84
+ cond_fn=None,
85
+ cond_start_step=0,
86
+ ):
87
+ """
88
+ Forward pass of the PoseDiffusionModel.
89
+
90
+ Args:
91
+ image (torch.Tensor):
92
+ Input image tensor, Bx3xHxW.
93
+ gt_cameras (Optional[CamerasBase], optional):
94
+ Camera object. Defaults to None.
95
+ sequence_name (Optional[List[str]], optional):
96
+ List of sequence names. Defaults to None.
97
+ cond_fn ([type], optional):
98
+ Conditional function. Wrapper for GGS or other functions.
99
+ cond_start_step (int, optional):
100
+ The sampling step to start using conditional function.
101
+
102
+ Returns:
103
+ PerspectiveCameras: PyTorch3D camera object.
104
+ """
105
+
106
+ z = self.image_feature_extractor(image)
107
+
108
+ z = z.unsqueeze(0)
109
+
110
+ B, N, _ = z.shape
111
+ target_shape = [B, N, self.target_dim]
112
+
113
+ # sampling
114
+ pose_encoding, pose_encoding_diffusion_samples = self.diffuser.sample(
115
+ shape=target_shape,
116
+ z=z,
117
+ cond_fn=cond_fn,
118
+ cond_start_step=cond_start_step,
119
+ )
120
+
121
+ # convert the encoded representation to PyTorch3D cameras
122
+ pred_cameras = pose_encoding_to_camera(
123
+ pose_encoding, pose_encoding_type=self.pose_encoding_type
124
+ )
125
+
126
+ return pred_cameras
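
A rough end-to-end smoke-test sketch with random weights, assuming only the cfgs/ folder from this commit; loading cfg.ckpt as app.py does is what makes the predictions meaningful:

import torch
import hydra
from hydra.utils import instantiate

with hydra.initialize(config_path="./cfgs/"):
    cfg = hydra.compose(config_name="fast")

model = instantiate(cfg.MODEL, _recursive_=False).eval()
images = torch.rand(8, 3, cfg.image_size, cfg.image_size)   # 8 dummy frames
with torch.no_grad():
    cameras = model(image=images, cond_fn=None, cond_start_step=cfg.GGS.start_step)
print(cameras.R.shape, cameras.T.shape, cameras.focal_length.shape)
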
packages.txt ADDED
File without changes
pre-requirements.txt ADDED
@@ -0,0 +1,2 @@
1
+ torch==1.13.0
2
+ torchvision==0.14.0
requirements.txt ADDED
@@ -0,0 +1,5 @@
1
+ hydra-core
2
+ omegaconf
3
+ opencv-python
4
+ einops
5
+ git+https://github.com/facebookresearch/PoseDiffusion.git
util/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+
util/camera_transform.py ADDED
@@ -0,0 +1,63 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ from pytorch3d.transforms.rotation_conversions import (
9
+ matrix_to_quaternion,
10
+ quaternion_to_matrix,
11
+ )
12
+ from pytorch3d.renderer.cameras import CamerasBase, PerspectiveCameras
13
+
14
+
15
+ def pose_encoding_to_camera(
16
+ pose_encoding,
17
+ pose_encoding_type="absT_quaR_logFL",
18
+ log_focal_length_bias=1.8,
19
+ min_focal_length=0.1,
20
+ max_focal_length=20,
21
+ ):
22
+ """
23
+ Args:
24
+ pose_encoding: A tensor of shape `BxNxC`, containing a batch of
25
+ `BxN` `C`-dimensional pose encodings.
26
+ pose_encoding_type: The type of pose encoding,
27
+ only "absT_quaR_logFL" is supported.
28
+ """
29
+
30
+ batch_size, num_poses, _ = pose_encoding.shape
31
+ pose_encoding_reshaped = pose_encoding.reshape(
32
+ -1, pose_encoding.shape[-1]
33
+ ) # Reshape to BNxC
34
+
35
+ if pose_encoding_type == "absT_quaR_logFL":
36
+ # forced that 3 for absT, 4 for quaR, 2 logFL
37
+ # TODO: converted to 1 dim for logFL, consistent with our paper
38
+ abs_T = pose_encoding_reshaped[:, :3]
39
+ quaternion_R = pose_encoding_reshaped[:, 3:7]
40
+ R = quaternion_to_matrix(quaternion_R)
41
+
42
+ log_focal_length = pose_encoding_reshaped[:, 7:9]
43
+
44
+ # log_focal_length_bias was the hyperparameter
45
+ # to ensure the mean of logFL close to 0 during training
46
+ # Now converted back
47
+ focal_length = (log_focal_length + log_focal_length_bias).exp()
48
+
49
+ # clamp to avoid weird fl values
50
+ focal_length = torch.clamp(
51
+ focal_length, min=min_focal_length, max=max_focal_length
52
+ )
53
+ else:
54
+ raise ValueError(f"Unknown pose encoding {pose_encoding_type}")
55
+
56
+ pred_cameras = PerspectiveCameras(
57
+ focal_length=focal_length,
58
+ R=R,
59
+ T=abs_T,
60
+ device=R.device,
61
+ )
62
+
63
+ return pred_cameras
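
An illustrative decoding of a random batch of "absT_quaR_logFL" encodings (3 translation + 4 quaternion + 2 log-focal-length values per camera), not part of the commit:

import torch
from util.camera_transform import pose_encoding_to_camera

pose_encoding = torch.randn(1, 8, 9)       # B x N x 9
cameras = pose_encoding_to_camera(pose_encoding)
print(cameras.R.shape, cameras.T.shape, cameras.focal_length.shape)
# torch.Size([8, 3, 3]) torch.Size([8, 3]) torch.Size([8, 2])
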
util/embedding.py ADDED
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import math
10
+ from pytorch3d.renderer import HarmonicEmbedding
11
+
12
+
13
+ class TimeStepEmbedding(nn.Module):
14
+ # learned from https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/nn.py
15
+ def __init__(self, dim=256, max_period=10000):
16
+ super().__init__()
17
+ self.dim = dim
18
+ self.max_period = max_period
19
+
20
+ self.linear = nn.Sequential(
21
+ nn.Linear(dim, dim // 2),
22
+ nn.SiLU(),
23
+ nn.Linear(dim // 2, dim // 2),
24
+ )
25
+
26
+ self.out_dim = dim // 2
27
+
28
+ def _compute_freqs(self, half):
29
+ freqs = torch.exp(
30
+ -math.log(self.max_period)
31
+ * torch.arange(start=0, end=half, dtype=torch.float32)
32
+ / half
33
+ )
34
+ return freqs
35
+
36
+ def forward(self, timesteps):
37
+ half = self.dim // 2
38
+ freqs = self._compute_freqs(half).to(device=timesteps.device)
39
+ args = timesteps[:, None].float() * freqs[None]
40
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
41
+ if self.dim % 2:
42
+ embedding = torch.cat(
43
+ [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
44
+ )
45
+
46
+ output = self.linear(embedding)
47
+ return output
48
+
49
+
50
+ class PoseEmbedding(nn.Module):
51
+ def __init__(self, target_dim, n_harmonic_functions=10, append_input=True):
52
+ super().__init__()
53
+
54
+ self._emb_pose = HarmonicEmbedding(
55
+ n_harmonic_functions=n_harmonic_functions, append_input=append_input
56
+ )
57
+
58
+ self.out_dim = self._emb_pose.get_output_dim(target_dim)
59
+
60
+ def forward(self, pose_encoding):
61
+ e_pose_encoding = self._emb_pose(pose_encoding)
62
+ return e_pose_encoding
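
A small sketch: both modules expose `.out_dim`, which Denoiser uses to size its first linear layer (128 for the time embedding, 9 * (2*10 + 1) = 189 for the pose encoding); the inputs below are arbitrary:

import torch
from util.embedding import TimeStepEmbedding, PoseEmbedding

time_embed = TimeStepEmbedding()
pose_embed = PoseEmbedding(target_dim=9)
print(time_embed(torch.tensor([5, 50])).shape)    # torch.Size([2, 128])
print(pose_embed(torch.randn(2, 8, 9)).shape)     # torch.Size([2, 8, 189])
print(time_embed.out_dim, pose_embed.out_dim)     # 128 189
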
util/geometry_guided_sampling.py ADDED
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ from typing import Dict, List, Optional, Union
9
+ from util.camera_transform import pose_encoding_to_camera
10
+ from util.get_fundamental_matrix import get_fundamental_matrices
11
+ from pytorch3d.renderer.cameras import CamerasBase, PerspectiveCameras
12
+
13
+
14
+ def geometry_guided_sampling(
15
+ model_mean: torch.Tensor,
16
+ t: int,
17
+ matches_dict: Dict,
18
+ GGS_cfg: Dict,
19
+ ):
20
+ # pre-process matches
21
+ b, c, h, w = matches_dict["img_shape"]
22
+ device = model_mean.device
23
+
24
+ def _to_device(tensor):
25
+ return torch.from_numpy(tensor).to(device)
26
+
27
+ kp1 = _to_device(matches_dict["kp1"])
28
+ kp2 = _to_device(matches_dict["kp2"])
29
+ i12 = _to_device(matches_dict["i12"])
30
+
31
+ pair_idx = i12[:, 0] * b + i12[:, 1]
32
+ pair_idx = pair_idx.long()
33
+
34
+ def _to_homogeneous(tensor):
35
+ return torch.nn.functional.pad(tensor, [0, 1], value=1)
36
+
37
+ kp1_homo = _to_homogeneous(kp1)
38
+ kp2_homo = _to_homogeneous(kp2)
39
+
40
+ i1, i2 = [
41
+ i.reshape(-1) for i in torch.meshgrid(torch.arange(b), torch.arange(b))
42
+ ]
43
+
44
+ processed_matches = {
45
+ "kp1_homo": kp1_homo,
46
+ "kp2_homo": kp2_homo,
47
+ "i1": i1,
48
+ "i2": i2,
49
+ "h": h,
50
+ "w": w,
51
+ "pair_idx": pair_idx,
52
+ }
53
+
54
+ # conduct GGS
55
+ model_mean = GGS_optimize(model_mean, t, processed_matches, **GGS_cfg)
56
+
57
+ # Optimize FL, R, and T separately
58
+ model_mean = GGS_optimize(
59
+ model_mean,
60
+ t,
61
+ processed_matches,
62
+ update_T=False,
63
+ update_R=False,
64
+ update_FL=True,
65
+ **GGS_cfg,
66
+ ) # only optimize FL
67
+
68
+ model_mean = GGS_optimize(
69
+ model_mean,
70
+ t,
71
+ processed_matches,
72
+ update_T=False,
73
+ update_R=True,
74
+ update_FL=False,
75
+ **GGS_cfg,
76
+ ) # only optimize R
77
+
78
+ model_mean = GGS_optimize(
79
+ model_mean,
80
+ t,
81
+ processed_matches,
82
+ update_T=True,
83
+ update_R=False,
84
+ update_FL=False,
85
+ **GGS_cfg,
86
+ ) # only optimize T
87
+
88
+ model_mean = GGS_optimize(model_mean, t, processed_matches, **GGS_cfg)
89
+ return model_mean
90
+
91
+
92
+ def GGS_optimize(
93
+ model_mean: torch.Tensor,
94
+ t: int,
95
+ processed_matches: Dict,
96
+ update_R: bool = True,
97
+ update_T: bool = True,
98
+ update_FL: bool = True,
99
+ # the args below come from **GGS_cfg
100
+ alpha: float = 0.0001,
101
+ learning_rate: float = 1e-2,
102
+ iter_num: int = 100,
103
+ sampson_max: int = 10,
104
+ min_matches: int = 10,
105
+ pose_encoding_type: str = "absT_quaR_logFL",
106
+ **kwargs,
107
+ ):
108
+ with torch.enable_grad():
109
+ model_mean.requires_grad_(True)
110
+
111
+ if update_R and update_T and update_FL:
112
+ iter_num = iter_num * 2
113
+
114
+ optimizer = torch.optim.SGD(
115
+ [model_mean], lr=learning_rate, momentum=0.9
116
+ )
117
+ batch_size = model_mean.shape[1]
118
+
119
+ for _ in range(iter_num):
120
+ valid_sampson, sampson_to_print = compute_sampson_distance(
121
+ model_mean,
122
+ t,
123
+ processed_matches,
124
+ update_R=update_R,
125
+ update_T=update_T,
126
+ update_FL=update_FL,
127
+ pose_encoding_type=pose_encoding_type,
128
+ sampson_max=sampson_max,
129
+ )
130
+
131
+ if min_matches > 0:
132
+ valid_match_per_frame = len(valid_sampson) / batch_size
133
+ if valid_match_per_frame < min_matches:
134
+ print(
135
+ "Drop this pair because of insufficient valid matches"
136
+ )
137
+ break
138
+
139
+ loss = valid_sampson.mean()
140
+ optimizer.zero_grad()
141
+ loss.backward()
142
+
143
+ grads = model_mean.grad
144
+ grad_norm = grads.norm()
145
+ grad_mask = (grads.abs() > 0).detach()
146
+ model_mean_norm = (model_mean * grad_mask).norm()
147
+
148
+ max_norm = alpha * model_mean_norm / learning_rate
149
+
150
+ total_norm = torch.nn.utils.clip_grad_norm_(model_mean, max_norm)
151
+ optimizer.step()
152
+
153
+ print(f"t={t:02d} | sampson={sampson_to_print:05f}")
154
+ model_mean = model_mean.detach()
155
+ return model_mean
156
+
157
+
158
+ def compute_sampson_distance(
159
+ model_mean: torch.Tensor,
160
+ t: int,
161
+ processed_matches: Dict,
162
+ update_R=True,
163
+ update_T=True,
164
+ update_FL=True,
165
+ pose_encoding_type: str = "absT_quaR_logFL",
166
+ sampson_max: int = 10,
167
+ ):
168
+ camera = pose_encoding_to_camera(model_mean, pose_encoding_type)
169
+
170
+ # pick the mean of the predicted focal length
171
+ camera.focal_length = camera.focal_length.mean(dim=0).repeat(
172
+ len(camera.focal_length), 1
173
+ )
174
+
175
+ if not update_R:
176
+ camera.R = camera.R.detach()
177
+
178
+ if not update_T:
179
+ camera.T = camera.T.detach()
180
+
181
+ if not update_FL:
182
+ camera.focal_length = camera.focal_length.detach()
183
+
184
+ kp1_homo, kp2_homo, i1, i2, he, wi, pair_idx = processed_matches.values()
185
+ F_2_to_1 = get_fundamental_matrices(
186
+ camera, he, wi, i1, i2, l2_normalize_F=False
187
+ )
188
+ F = F_2_to_1.permute(0, 2, 1) # y1^T F y2 = 0
189
+
190
+ def _sampson_distance(F, kp1_homo, kp2_homo, pair_idx):
191
+ left = torch.bmm(kp1_homo[:, None], F[pair_idx])
192
+ right = torch.bmm(F[pair_idx], kp2_homo[..., None])
193
+
194
+ bottom = (
195
+ left[:, :, 0].square()
196
+ + left[:, :, 1].square()
197
+ + right[:, 0, :].square()
198
+ + right[:, 1, :].square()
199
+ )
200
+ top = torch.bmm(left, kp2_homo[..., None]).square()
201
+
202
+ sampson = top[:, 0] / bottom
203
+ return sampson
204
+
205
+ sampson = _sampson_distance(
206
+ F,
207
+ kp1_homo.float(),
208
+ kp2_homo.float(),
209
+ pair_idx,
210
+ )
211
+
212
+ sampson_to_print = sampson.detach().clone().clamp(max=sampson_max).mean()
213
+ sampson = sampson[sampson < sampson_max]
214
+
215
+ return sampson, sampson_to_print
util/get_fundamental_matrix.py ADDED
@@ -0,0 +1,57 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import pytorch3d
9
+ from pytorch3d.utils import opencv_from_cameras_projection
10
+ from pytorch3d.transforms.so3 import hat
11
+ from pytorch3d.renderer.cameras import CamerasBase, PerspectiveCameras
12
+
13
+
14
+ def get_fundamental_matrices(
15
+ camera: CamerasBase,
16
+ height: int,
17
+ width: int,
18
+ index1: torch.LongTensor,
19
+ index2: torch.LongTensor,
20
+ l2_normalize_F=False,
21
+ ):
22
+ """Compute fundamental matrices for given camera parameters."""
23
+ batch_size = camera.R.shape[0]
24
+
25
+ # Convert to opencv / colmap / Hartley&Zisserman convention
26
+ image_size_t = (
27
+ torch.LongTensor([height, width])[None]
28
+ .repeat(batch_size, 1)
29
+ .to(camera.device)
30
+ )
31
+ R, t, K = opencv_from_cameras_projection(camera, image_size=image_size_t)
32
+
33
+ F, E = get_fundamental_matrix(
34
+ K[index1], R[index1], t[index1], K[index2], R[index2], t[index2]
35
+ )
36
+
37
+ if l2_normalize_F:
38
+ F_scale = torch.norm(F, dim=(1, 2))
39
+ F_scale = F_scale.clamp(min=0.0001)
40
+ F = F / F_scale[:, None, None]
41
+
42
+ return F
43
+
44
+
45
+ def get_fundamental_matrix(K1, R1, t1, K2, R2, t2):
46
+ E = get_essential_matrix(R1, t1, R2, t2)
47
+ F = K2.inverse().permute(0, 2, 1).matmul(E).matmul(K1.inverse())
48
+ return F, E # p2^T F p1 = 0
49
+
50
+
51
+ def get_essential_matrix(R1, t1, R2, t2):
52
+ R12 = R2.matmul(R1.permute(0, 2, 1))
53
+ t12 = t2 - R12.matmul(t1[..., None])[..., 0]
54
+ E_R = R12
55
+ E_t = -E_R.permute(0, 2, 1).matmul(t12[..., None])[..., 0]
56
+ E = E_R.matmul(hat(E_t))
57
+ return E
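
A shape-level sketch (not part of the commit) of the pairwise fundamental matrices that geometry_guided_sampling consumes, using identity rotations and random translations purely for illustration:

import torch
from pytorch3d.renderer.cameras import PerspectiveCameras
from util.get_fundamental_matrix import get_fundamental_matrices

cams = PerspectiveCameras(
    R=torch.eye(3)[None].repeat(3, 1, 1),
    T=torch.randn(3, 3),
    focal_length=torch.ones(3, 2),
)
index1 = torch.tensor([0, 0, 1])   # first image of each pair
index2 = torch.tensor([1, 2, 2])   # second image of each pair
F = get_fundamental_matrices(cams, height=224, width=224, index1=index1, index2=index2)
print(F.shape)   # torch.Size([3, 3, 3]): one 3x3 matrix per requested pair
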
util/load_img_folder.py ADDED
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import os
8
+ import numpy as np
9
+ from PIL import Image
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from typing import (
13
+ Any,
14
+ ClassVar,
15
+ Dict,
16
+ Iterable,
17
+ List,
18
+ Optional,
19
+ Sequence,
20
+ Tuple,
21
+ Type,
22
+ TYPE_CHECKING,
23
+ Union,
24
+ )
25
+
26
+
27
+ def load_and_preprocess_images(
28
+ folder_path: str, image_size: int = 224, mode: str = "bilinear"
29
+ ) -> Tuple[torch.Tensor, Dict[str, Any]]:
30
+ image_paths = [
31
+ os.path.join(folder_path, file)
32
+ for file in os.listdir(folder_path)
33
+ if file.lower().endswith((".png", ".jpg", ".jpeg"))
34
+ ]
35
+ image_paths.sort()
36
+
37
+ images = []
38
+ bboxes_xyxy = []
39
+ scales = []
40
+ for path in image_paths:
41
+ image = _load_image(path)
42
+ image, bbox_xyxy, min_hw = _center_crop_square(image)
43
+ minscale = image_size / min_hw
44
+
45
+ imre = F.interpolate(
46
+ torch.from_numpy(image)[None],
47
+ size=(image_size, image_size),
48
+ mode=mode,
49
+ align_corners=False if mode == "bilinear" else None,
50
+ )[0]
51
+
52
+ images.append(imre.numpy())
53
+ bboxes_xyxy.append(bbox_xyxy.numpy())
54
+ scales.append(minscale)
55
+
56
+ images_tensor = torch.from_numpy(np.stack(images))
57
+
58
+ # assume all the images have the same shape for GGS
59
+ image_info = {
60
+ "size": (min_hw, min_hw),
61
+ "bboxes_xyxy": np.stack(bboxes_xyxy),
62
+ "resized_scales": np.stack(scales),
63
+ }
64
+ return images_tensor, image_info
65
+
66
+
67
+ # helper functions
68
+
69
+
70
+ def _load_image(path) -> np.ndarray:
71
+ with Image.open(path) as pil_im:
72
+ im = np.array(pil_im.convert("RGB"))
73
+ im = im.transpose((2, 0, 1))
74
+ im = im.astype(np.float32) / 255.0
75
+ return im
76
+
77
+
78
+ def _center_crop_square(image: np.ndarray) -> Tuple[np.ndarray, torch.Tensor, int]:
79
+ h, w = image.shape[1:]
80
+ min_dim = min(h, w)
81
+ top = (h - min_dim) // 2
82
+ left = (w - min_dim) // 2
83
+ cropped_image = image[:, top : top + min_dim, left : left + min_dim]
84
+
85
+ # bbox_xywh: the cropped region
86
+ bbox_xywh = torch.tensor([left, top, min_dim, min_dim])
87
+
88
+ # the format from xywh to xyxy
89
+ bbox_xyxy = _clamp_box_to_image_bounds_and_round(
90
+ _get_clamp_bbox(
91
+ bbox_xywh,
92
+ box_crop_context=0.0,
93
+ ),
94
+ image_size_hw=(h, w),
95
+ )
96
+ return cropped_image, bbox_xyxy, min_dim
97
+
98
+
99
+ def _get_clamp_bbox(
100
+ bbox: torch.Tensor,
101
+ box_crop_context: float = 0.0,
102
+ ) -> torch.Tensor:
103
+ # box_crop_context: rate of expansion for bbox
104
+ # returns possibly expanded bbox xyxy as float
105
+
106
+ bbox = bbox.clone() # do not edit bbox in place
107
+
108
+ # increase box size
109
+ if box_crop_context > 0.0:
110
+ c = box_crop_context
111
+ bbox = bbox.float()
112
+ bbox[0] -= bbox[2] * c / 2
113
+ bbox[1] -= bbox[3] * c / 2
114
+ bbox[2] += bbox[2] * c
115
+ bbox[3] += bbox[3] * c
116
+
117
+ if (bbox[2:] <= 1.0).any():
118
+ raise ValueError(
119
+ f"squashed image!! The bounding box contains no pixels."
120
+ )
121
+
122
+ bbox[2:] = torch.clamp(
123
+ bbox[2:], 2
124
+ ) # set min height, width to 2 along both axes
125
+ bbox_xyxy = _bbox_xywh_to_xyxy(bbox, clamp_size=2)
126
+
127
+ return bbox_xyxy
128
+
129
+
130
+ def _bbox_xywh_to_xyxy(
131
+ xywh: torch.Tensor, clamp_size: Optional[int] = None
132
+ ) -> torch.Tensor:
133
+ xyxy = xywh.clone()
134
+ if clamp_size is not None:
135
+ xyxy[2:] = torch.clamp(xyxy[2:], clamp_size)
136
+ xyxy[2:] += xyxy[:2]
137
+ return xyxy
138
+
139
+
140
+ def _clamp_box_to_image_bounds_and_round(
141
+ bbox_xyxy: torch.Tensor,
142
+ image_size_hw: Tuple[int, int],
143
+ ) -> torch.LongTensor:
144
+ bbox_xyxy = bbox_xyxy.clone()
145
+ bbox_xyxy[[0, 2]] = torch.clamp(bbox_xyxy[[0, 2]], 0, image_size_hw[-1])
146
+ bbox_xyxy[[1, 3]] = torch.clamp(bbox_xyxy[[1, 3]], 0, image_size_hw[-2])
147
+ if not isinstance(bbox_xyxy, torch.LongTensor):
148
+ bbox_xyxy = bbox_xyxy.round().long()
149
+ return bbox_xyxy # pyre-ignore [7]
150
+
151
+
152
+ if __name__ == "__main__":
153
+ # Example usage:
154
+ folder_path = "path/to/your/folder"
155
+ image_size = 224
156
+ images_tensor = load_and_preprocess_images(folder_path, image_size)
157
+ print(images_tensor.shape)
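The `bboxes_xyxy` and `resized_scales` entries exist so that 2D points detected on the original frames can be mapped into the square, resized crops. A small illustrative sketch of that mapping (the numbers are hypothetical; the same shift-then-scale is what util/match_extraction.py applies to COLMAP keypoints):

import numpy as np

# hypothetical 640x480 frame, center-cropped to 480x480 and resized to 224x224
bbox_xyxy = np.array([80, 0, 560, 480])
resized_scale = 224 / 480  # image_size / min(h, w)

kp_original = np.array([300.0, 200.0])   # (x, y) on the original frame
kp_crop = kp_original - bbox_xyxy[:2]    # shift into the crop
kp_resized = kp_crop * resized_scale     # scale into the 224x224 crop
print(kp_resized)                        # ~[102.7, 93.3]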
util/match_extraction.py ADDED
@@ -0,0 +1,175 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import os
+ import shutil
+ import tempfile
+ from pathlib import Path
+
+ import numpy as np
+ import pycolmap
+ from typing import Optional, List, Dict, Any, Tuple
+ from hloc import (
+     extract_features,
+     logger,
+     match_features,
+     pairs_from_exhaustive,
+ )
+ from hloc.triangulation import (
+     import_features,
+     import_matches,
+     estimation_and_geometric_verification,
+     parse_option_args,
+     OutputCapture,
+ )
+ from hloc.utils.database import (
+     COLMAPDatabase,
+     image_ids_to_pair_id,
+     pair_id_to_image_ids,
+ )
+ from hloc.reconstruction import create_empty_db, import_images, get_image_ids
+
+
+ def extract_match(image_folder_path: str, image_info: Dict):
+     # Now only supports SuperPoint features with LightGlue matching
+     with tempfile.TemporaryDirectory() as tmpdir:
+         tmp_mapping = os.path.join(tmpdir, "mapping")
+         os.makedirs(tmp_mapping)
+         for filename in os.listdir(image_folder_path):
+             if filename.lower().endswith(
+                 (".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tiff")
+             ):
+                 shutil.copy(
+                     os.path.join(image_folder_path, filename),
+                     os.path.join(tmp_mapping, filename),
+                 )
+         matches, keypoints = run_hloc(tmpdir)
+
+     # convert from the COLMAP format to the PyTorch3D one
+     kp1, kp2, i12 = colmap_keypoint_to_pytorch3d(matches, keypoints, image_info)
+
+     return kp1, kp2, i12
+
+
+ def colmap_keypoint_to_pytorch3d(matches, keypoints, image_info):
+     kp1, kp2, i12 = [], [], []
+     bbox_xyxy, scale = image_info["bboxes_xyxy"], image_info["resized_scales"]
+
+     for idx in keypoints:
+         # coordinate change from COLMAP to OpenCV
+         cur_keypoint = keypoints[idx] - 0.5
+
+         # go to the coordinate frame after cropping
+         # use idx - 1 here because COLMAP image ids start from 1 instead of 0
+         cur_keypoint = cur_keypoint - [
+             bbox_xyxy[idx - 1][0],
+             bbox_xyxy[idx - 1][1],
+         ]
+         cur_keypoint = cur_keypoint * scale[idx - 1]
+         keypoints[idx] = cur_keypoint
+
+     for (r_idx, q_idx), pair_match in matches.items():
+         if pair_match is not None:
+             kp1.append(keypoints[r_idx][pair_match[:, 0]])
+             kp2.append(keypoints[q_idx][pair_match[:, 1]])
+
+             i12_pair = np.array([[r_idx - 1, q_idx - 1]])
+             i12.append(np.repeat(i12_pair, len(pair_match), axis=0))
+
+     if kp1:
+         kp1, kp2, i12 = map(np.concatenate, (kp1, kp2, i12), (0, 0, 0))
+     else:
+         kp1 = kp2 = i12 = None
+
+     return kp1, kp2, i12
+
+
+ def run_hloc(output_dir: str):
+     # learned from
+     # https://github.com/cvg/Hierarchical-Localization/blob/master/pipeline_SfM.ipynb
+
+     images = Path(output_dir)
+     outputs = Path(os.path.join(output_dir, "output"))
+     sfm_pairs = outputs / "pairs-sfm.txt"
+     sfm_dir = outputs / "sfm"
+     features = outputs / "features.h5"
+     matches = outputs / "matches.h5"
+
+     feature_conf = extract_features.confs[
+         "superpoint_inloc"
+     ]  # or superpoint_max
+     matcher_conf = match_features.confs["superpoint+lightglue"]
+
+     references = [
+         p.relative_to(images).as_posix()
+         for p in (images / "mapping/").iterdir()
+     ]
+
+     extract_features.main(
+         feature_conf, images, image_list=references, feature_path=features
+     )
+     pairs_from_exhaustive.main(sfm_pairs, image_list=references)
+     match_features.main(
+         matcher_conf, sfm_pairs, features=features, matches=matches
+     )
+
+     matches, keypoints = compute_matches_and_keypoints(
+         sfm_dir, images, sfm_pairs, features, matches, image_list=references
+     )
+
+     return matches, keypoints
+
+
+ def compute_matches_and_keypoints(
+     sfm_dir: Path,
+     image_dir: Path,
+     pairs: Path,
+     features: Path,
+     matches: Path,
+     camera_mode: pycolmap.CameraMode = pycolmap.CameraMode.AUTO,
+     verbose: bool = False,
+     min_match_score: Optional[float] = None,
+     image_list: Optional[List[str]] = None,
+     image_options: Optional[Dict[str, Any]] = None,
+ ) -> Tuple[Dict, Dict]:
+     # learned from
+     # https://github.com/cvg/Hierarchical-Localization/blob/master/hloc/reconstruction.py
+
+     sfm_dir.mkdir(parents=True, exist_ok=True)
+     database = sfm_dir / "database.db"
+
+     create_empty_db(database)
+     import_images(image_dir, database, camera_mode, image_list, image_options)
+     image_ids = get_image_ids(database)
+     import_features(image_ids, database, features)
+     import_matches(image_ids, database, pairs, matches, min_match_score)
+     estimation_and_geometric_verification(database, pairs, verbose)
+
+     db = COLMAPDatabase.connect(database)
+
+     matches = dict(
+         (
+             pair_id_to_image_ids(pair_id),
+             _blob_to_array_safe(data, np.uint32, (-1, 2)),
+         )
+         for pair_id, data in db.execute("SELECT pair_id, data FROM matches")
+     )
+
+     keypoints = dict(
+         (image_id, _blob_to_array_safe(data, np.float32, (-1, 2)))
+         for image_id, data in db.execute("SELECT image_id, data FROM keypoints")
+     )
+
+     db.close()
+
+     return matches, keypoints
+
+
+ def _blob_to_array_safe(blob, dtype, shape=(-1,)):
+     if blob is not None:
+         # np.frombuffer (rather than the deprecated np.fromstring) decodes the raw blob
+         return np.frombuffer(blob, dtype=dtype).reshape(*shape)
+     else:
+         return blob
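Taken together, the two utilities above form the matching front end: the preprocessing step produces `image_info`, and `extract_match` converts the hloc/COLMAP output into correspondences in the resized-crop frame. A hedged usage sketch (the folder path is a placeholder, and hloc plus pycolmap must be installed):

from util.load_img_folder import load_and_preprocess_images
from util.match_extraction import extract_match

image_folder = "examples/scene1"  # placeholder path to a folder of .jpg/.png frames
images, image_info = load_and_preprocess_images(image_folder, image_size=224)

# kp1/kp2 are matched 2D points in crop coordinates; i12 holds the 0-based image index pairs
kp1, kp2, i12 = extract_match(image_folder, image_info)
if kp1 is not None:
    print(images.shape, kp1.shape, kp2.shape, i12.shape)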
util/metric.py ADDED
@@ -0,0 +1,22 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import random
+ import numpy as np
+ import torch
+
+
+ def compute_ARE(rotation1, rotation2):
+     """Angular rotation error, in degrees, between two batches of (B, 3, 3) rotation matrices."""
+     if isinstance(rotation1, torch.Tensor):
+         rotation1 = rotation1.cpu().detach().numpy()
+     if isinstance(rotation2, torch.Tensor):
+         rotation2 = rotation2.cpu().detach().numpy()
+
+     R_rel = np.einsum("Bij,Bjk->Bik", rotation1.transpose(0, 2, 1), rotation2)
+     t = (np.trace(R_rel, axis1=1, axis2=2) - 1) / 2
+     theta = np.arccos(np.clip(t, -1, 1))
+     error = theta * 180 / np.pi
+     return np.minimum(error, np.abs(180 - error))
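A quick sanity check of compute_ARE (illustrative, not from this commit): the identity against a 30-degree rotation about the z-axis should give an angular error of about 30 degrees.

import numpy as np
from util.metric import compute_ARE

a = np.deg2rad(30.0)
Rz = np.array([[[np.cos(a), -np.sin(a), 0.0],
                [np.sin(a),  np.cos(a), 0.0],
                [0.0,        0.0,       1.0]]])  # batch of one rotation
identity = np.eye(3)[None]

print(compute_ARE(identity, Rz))  # ~[30.]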
util/utils.py ADDED
@@ -0,0 +1,17 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import random
+
+ import numpy as np
+ import torch
+ import tempfile
+
+
+ def seed_all_random_engines(seed: int) -> None:
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     random.seed(seed)
weights/co3d_model_Apr16.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a7084b19cddce8dcc8f9197a8bbcf330fd0edf1c0c97b628c35180d8a18edbeb
+ size 155952931