add hdm demo v1
Note: this diff view is limited to 50 files; the commit contains more changes than are shown below.
- README.md +15 -12
- app.py +177 -0
- configs/__init__.py +0 -0
- configs/structured.py +416 -0
- dataset/__init__.py +301 -0
- dataset/base_data.py +110 -0
- dataset/behave_paths.py +228 -0
- dataset/demo_dataset.py +198 -0
- dataset/img_utils.py +149 -0
- demo.py +280 -0
- diffusion_utils.py +313 -0
- examples/017450/k1.color.jpg +0 -0
- examples/017450/k1.obj_rend_mask.png +0 -0
- examples/017450/k1.person_mask.png +0 -0
- model/__init__.py +28 -0
- model/feature_model.py +160 -0
- model/model.py +303 -0
- model/model_coloring.py +84 -0
- model/model_diff_data.py +238 -0
- model/model_hoattn.py +457 -0
- model/model_utils.py +58 -0
- model/point_cloud_model.py +67 -0
- model/point_cloud_transformer_model.py +80 -0
- model/projection_model.py +273 -0
- model/pvcnn/__init__.py +0 -0
- model/pvcnn/modules/__init__.py +8 -0
- model/pvcnn/modules/ball_query.py +69 -0
- model/pvcnn/modules/frustum.py +138 -0
- model/pvcnn/modules/functional/__init__.py +7 -0
- model/pvcnn/modules/functional/backend.py +33 -0
- model/pvcnn/modules/functional/ball_query.py +19 -0
- model/pvcnn/modules/functional/devoxelization.py +42 -0
- model/pvcnn/modules/functional/grouping.py +32 -0
- model/pvcnn/modules/functional/interpolatation.py +38 -0
- model/pvcnn/modules/functional/loss.py +17 -0
- model/pvcnn/modules/functional/sampling.py +84 -0
- model/pvcnn/modules/functional/src/ball_query/ball_query.cpp +30 -0
- model/pvcnn/modules/functional/src/ball_query/ball_query.cu +59 -0
- model/pvcnn/modules/functional/src/ball_query/ball_query.cuh +8 -0
- model/pvcnn/modules/functional/src/ball_query/ball_query.hpp +10 -0
- model/pvcnn/modules/functional/src/bindings.cpp +37 -0
- model/pvcnn/modules/functional/src/cuda_utils.cuh +39 -0
- model/pvcnn/modules/functional/src/grouping/grouping.cpp +44 -0
- model/pvcnn/modules/functional/src/grouping/grouping.cu +85 -0
- model/pvcnn/modules/functional/src/grouping/grouping.cuh +9 -0
- model/pvcnn/modules/functional/src/grouping/grouping.hpp +10 -0
- model/pvcnn/modules/functional/src/interpolate/neighbor_interpolate.cpp +65 -0
- model/pvcnn/modules/functional/src/interpolate/neighbor_interpolate.cu +181 -0
- model/pvcnn/modules/functional/src/interpolate/neighbor_interpolate.cuh +16 -0
- model/pvcnn/modules/functional/src/interpolate/neighbor_interpolate.hpp +16 -0
README.md
CHANGED
@@ -1,13 +1,16 @@
-emoji: 🌍
-colorFrom: yellow
-colorTo: green
-sdk: gradio
-sdk_version: 4.20.1
-app_file: app.py
-pinned: false
-license: cc-by-nc-4.0
----
+# HDM
+Official implementation of the Hierarchical Diffusion Model (HDM) from the CVPR'24 paper "Template Free Reconstruction of Human Object Interaction".
 
+[Project Page](https://virtualhumans.mpi-inf.mpg.de/procigen-hdm/)|[Code](https://github.com/xiexh20/HDM)|[Dataset](https://edmond.mpg.de/dataset.xhtml?persistentId=doi:10.17617/3.2VUEUS )|[Paper](https://virtualhumans.mpi-inf.mpg.de/procigen-hdm/paper-lowreso.pdf)
+
+
+## Citation
+```
+@inproceedings{xie2023template_free,
+    title = {Template Free Reconstruction of Human-object Interaction with Procedural Interaction Generation},
+    author = {Xie, Xianghui and Bhatnagar, Bharat Lal and Lenssen, Jan Eric and Pons-Moll, Gerard},
+    booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+    month = {June},
+    year = {2024},
+}
+```
app.py
ADDED
@@ -0,0 +1,177 @@
"""
Demo built with gradio
"""
import pickle as pkl
import sys, os
import os.path as osp
from typing import Iterable, Optional
from functools import partial

import trimesh
from torch.utils.data import DataLoader
import cv2
from accelerate import Accelerator
from tqdm import tqdm
from glob import glob

sys.path.append(os.getcwd())
import hydra
import torch
import numpy as np
import imageio
import gradio as gr
import plotly.graph_objs as go
import training_utils

from configs.structured import ProjectConfig
from demo import DemoRunner
from dataset.demo_dataset import DemoDataset


md_description="""
# HDM Interaction Reconstruction Demo
### Official Implementation of the paper \"Template Free Reconstruction of Human Object Interaction\", CVPR'24.
[Project Page](https://virtualhumans.mpi-inf.mpg.de/procigen-hdm/)|[Code](https://github.com/xiexh20/HDM)|[Dataset](https://edmond.mpg.de/dataset.xhtml?persistentId=doi:10.17617/3.2VUEUS )|[Paper](https://virtualhumans.mpi-inf.mpg.de/procigen-hdm/paper-lowreso.pdf)


Upload your own human object interaction image and get full 3D reconstruction!

## Citation
```
@inproceedings{xie2023template_free,
    title = {Template Free Reconstruction of Human-object Interaction with Procedural Interaction Generation},
    author = {Xie, Xianghui and Bhatnagar, Bharat Lal and Lenssen, Jan Eric and Pons-Moll, Gerard},
    booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
    month = {June},
    year = {2024},
}
```
"""

def plot_points(colors, coords):
    """
    use plotly to visualize 3D point with colors
    """
    trace = go.Scatter3d(x=coords[:, 0], y=coords[:, 1], z=coords[:, 2], mode='markers',
                         marker=dict(
                             size=2,
                             color=colors
                         ))
    layout = go.Layout(
        scene=dict(
            xaxis=dict(
                title="",
                showgrid=False,
                zeroline=False,
                showline=False,
                ticks='',
                showticklabels=False
            ),
            yaxis=dict(
                title="",
                showgrid=False,
                zeroline=False,
                showline=False,
                ticks='',
                showticklabels=False
            ),
            zaxis=dict(
                title="",
                showgrid=False,
                zeroline=False,
                showline=False,
                ticks='',
                showticklabels=False
            ),
        ),
        margin=dict(l=0, r=0, b=0, t=0),
        showlegend=False
    )
    fig = go.Figure(data=[trace], layout=layout)
    return fig


def inference(runner: DemoRunner, cfg: ProjectConfig, rgb, mask_hum, mask_obj, std_coverage, input_seed):
    """
    given user input, run inference
    :param runner:
    :param cfg:
    :param rgb: (h, w, 3), np array
    :param mask_hum: (h, w, 3), np array
    :param mask_obj: (h, w, 3), np array
    :param std_coverage: float value, used to estimate camera translation
    :param input_seed: random seed
    :return: path to the 3D reconstruction, and an interactive 3D figure for visualizing the point cloud
    """
    # Set random seed
    training_utils.set_seed(int(input_seed))

    data = DemoDataset([], (cfg.dataset.image_size, cfg.dataset.image_size),
                       std_coverage)
    batch = data.image2batch(rgb, mask_hum, mask_obj)

    out_stage1, out_stage2 = runner.forward_batch(batch, cfg)
    points = out_stage2.points_packed().cpu().numpy()
    colors = out_stage2.features_packed().cpu().numpy()
    fig = plot_points(colors, points)
    # save tmp point cloud
    outdir = './results'
    os.makedirs(outdir, exist_ok=True)
    trimesh.PointCloud(points, colors).export(outdir + f"/pred_std{std_coverage}_seed{input_seed}_stage2.ply")
    trimesh.PointCloud(out_stage1.points_packed().cpu().numpy(),
                       out_stage1.features_packed().cpu().numpy()).export(outdir + f"/pred_std{std_coverage}_seed{input_seed}_stage1.ply")
    return fig, outdir + f"/pred_std{std_coverage}_seed{input_seed}_stage2.ply"


@hydra.main(config_path='configs', config_name='configs', version_base='1.1')
def main(cfg: ProjectConfig):
    # Setup model
    runner = DemoRunner(cfg)

    # Setup interface
    demo = gr.Blocks(title="HDM Interaction Reconstruction Demo")
    with demo:
        gr.Markdown(md_description)
        gr.HTML("""<h1 style="text-align:center; color:#10768c">HDM Demo</h1>""")
        gr.HTML("""<h3 style="text-align:center; color:#10768c">Instruction: Upload RGB, human, object masks and then click reconstruct.</h1>""")

        # Input data
        with gr.Row():
            input_rgb = gr.Image(label='Input RGB', type='numpy')
            input_mask_hum = gr.Image(label='Human mask', type='numpy')
        with gr.Row():
            input_mask_obj = gr.Image(label='Object mask', type='numpy')
            with gr.Column():
                # TODO: add hint for this value here
                input_std = gr.Number(label='Gaussian std coverage', value=3.5)
                input_seed = gr.Number(label='Random seed', value=42)
        # Output visualization
        with gr.Row():
            pc_plot = gr.Plot(label="Reconstructed point cloud")
            out_pc_download = gr.File(label="3D reconstruction for download")  # this allows downloading

        gr.HTML("""<br/>""")
        # Control
        with gr.Row():
            button_recon = gr.Button("Start Reconstruction", interactive=True, variant='secondary')
            button_recon.click(fn=partial(inference, runner, cfg),
                               inputs=[input_rgb, input_mask_hum, input_mask_obj, input_std, input_seed],
                               outputs=[pc_plot, out_pc_download])
        gr.HTML("""<br/>""")
        # Example input
        example_dir = cfg.run.code_dir_abs + "/examples"
        rgb, ps, obj = 'k1.color.jpg', 'k1.person_mask.png', 'k1.obj_rend_mask.png'
        example_images = gr.Examples([
            [f"{example_dir}/017450/{rgb}", f"{example_dir}/017450/{ps}", f"{example_dir}/017450/{obj}", 3.0, 42],
            [f"{example_dir}/002446/{rgb}", f"{example_dir}/002446/{ps}", f"{example_dir}/002446/{obj}", 3.0, 42],
            [f"{example_dir}/053431/{rgb}", f"{example_dir}/053431/{ps}", f"{example_dir}/053431/{obj}", 3.8, 42],
            [f"{example_dir}/158107/{rgb}", f"{example_dir}/158107/{ps}", f"{example_dir}/158107/{obj}", 3.8, 42],

        ], inputs=[input_rgb, input_mask_hum, input_mask_obj, input_std, input_seed],)

    # demo.launch(share=True)
    # Enabling queue for runtime>60s, see: https://github.com/tloen/alpaca-lora/issues/60#issuecomment-1510006062
    demo.queue(concurrency_count=3).launch(share=True)

if __name__ == '__main__':
    main()
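The Gradio blocks above are a thin wrapper around `inference`. For readers who want to run the model without the UI, a minimal headless sketch could look like the following. This is not part of the commit: it reuses `DemoRunner`, `DemoDataset.image2batch`, and `cfg.run.code_dir_abs` exactly as `app.py` does, and it assumes the masks expect RGB-ordered numpy images (as Gradio would pass them) and that the stage-1/stage-2 checkpoints configured in `ProjectConfig` are available.

```python
# Hypothetical headless driver for the HDM demo (a sketch, not the author's script).
import os.path as osp

import cv2
import hydra

import training_utils
from configs.structured import ProjectConfig
from dataset.demo_dataset import DemoDataset
from demo import DemoRunner


@hydra.main(config_path='configs', config_name='configs', version_base='1.1')
def run_once(cfg: ProjectConfig):
    training_utils.set_seed(42)
    runner = DemoRunner(cfg)

    # One of the bundled examples; cv2 loads BGR, so flip to RGB like Gradio would provide.
    example = osp.join(cfg.run.code_dir_abs, 'examples', '017450')
    rgb = cv2.imread(osp.join(example, 'k1.color.jpg'))[:, :, ::-1]
    mask_hum = cv2.imread(osp.join(example, 'k1.person_mask.png'))
    mask_obj = cv2.imread(osp.join(example, 'k1.obj_rend_mask.png'))

    # Same call pattern as inference() above: std coverage 3.0 matches the example row.
    data = DemoDataset([], (cfg.dataset.image_size, cfg.dataset.image_size), 3.0)
    batch = data.image2batch(rgb, mask_hum, mask_obj)
    out_stage1, out_stage2 = runner.forward_batch(batch, cfg)
    print(out_stage2.points_packed().shape)  # combined human+object point cloud


if __name__ == '__main__':
    run_once()
```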
configs/__init__.py
ADDED
File without changes
configs/structured.py
ADDED
@@ -0,0 +1,416 @@
import os
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Iterable
import os.path as osp

from hydra.core.config_store import ConfigStore
from hydra.conf import RunDir


@dataclass
class CustomHydraRunDir(RunDir):
    dir: str = './outputs/${run.name}/single'


@dataclass
class RunConfig:
    name: str = 'debug'
    job: str = 'train'
    mixed_precision: str = 'fp16'  # 'no'
    cpu: bool = False
    seed: int = 42
    val_before_training: bool = True
    vis_before_training: bool = True
    limit_train_batches: Optional[int] = None
    limit_val_batches: Optional[int] = None
    max_steps: int = 100_000
    checkpoint_freq: int = 1_000
    val_freq: int = 5_000
    vis_freq: int = 5_000
    # vis_freq: int = 10_000
    log_step_freq: int = 20
    print_step_freq: int = 100

    # config to run demo
    stage1_name: str = 'stage1'  # experiment name of the stage 1 model
    stage2_name: str = 'stage2'  # experiment name of the stage 2 model
    image_path: str = ''  # the path to the images for running demo, can be a single file or a glob pattern

    # abs path to working dir
    code_dir_abs: str = osp.dirname(osp.dirname(osp.abspath(__file__)))

    # Inference configs
    num_inference_steps: int = 1000
    diffusion_scheduler: Optional[str] = 'ddpm'
    num_samples: int = 1
    # num_sample_batches: Optional[int] = None
    num_sample_batches: Optional[int] = 2000  # XH: change to 2
    sample_from_ema: bool = False
    sample_save_evolutions: bool = False  # temporarily set by default
    save_name: str = 'sample'  # XH: additional save name
    redo: bool = False

    # for parallel sampling in slurm
    batch_start: int = 0
    batch_end: Optional[int] = None

    # Training configs
    freeze_feature_model: bool = True

    # Coloring training configs
    coloring_training_noise_std: float = 0.0
    coloring_sample_dir: Optional[str] = None

    sample_mode: str = 'sample'  # whether from noise or from some intermediate steps
    sample_noise_step: int = 500  # add noise to GT up to some steps, and then denoise
    sample_save_gt: bool = True


@dataclass
class LoggingConfig:
    wandb: bool = True
    wandb_project: str = 'pc2'


@dataclass
class PointCloudProjectionModelConfig:
    # Feature extraction arguments
    image_size: int = '${dataset.image_size}'
    image_feature_model: str = 'vit_base_patch16_224_mae'  # or 'vit_small_patch16_224_msn' or 'identity'
    use_local_colors: bool = True
    use_local_features: bool = True
    use_global_features: bool = False
    use_mask: bool = True
    use_distance_transform: bool = True

    # Point cloud data arguments. Note these are here because the processing happens
    # inside the model, rather than inside the dataset.
    scale_factor: float = "${dataset.scale_factor}"
    colors_mean: float = 0.5
    colors_std: float = 0.5
    color_channels: int = 3
    predict_shape: bool = True
    predict_color: bool = False

    # added by XH
    load_sample_init: bool = False  # load init samples from file
    sample_init_scale: float = 1.0  # scale the initial pc samples
    test_init_with_gtpc: bool = False  # test time init samples with GT samples
    consistent_center: bool = True  # use consistent center prediction by CCD-3DR
    voxel_resolution_multiplier: float = 1  # increase network voxel resolution

    # predict binary segmentation
    predict_binary: bool = False  # True for stage 1 model, False for others
    lw_binary: float = 3.0  # to have roughly the same magnitude of the binary segmentation loss
    # for separate model
    binary_training_noise_std: float = 0.1  # from github doc for predicting color
    self_conditioning: bool = False


@dataclass
class PVCNNAEModelConfig(PointCloudProjectionModelConfig):
    "my own model config, must inherit parent class"
    model_name: str = 'pvcnn-ae'
    latent_dim: int = 1024
    num_dec_blocks: int = 6
    block_dims: List[int] = field(default_factory=lambda: [512, 256])
    num_points: int = 1500
    bottleneck_dim: int = -1  # the input dim to the last MLP layer


@dataclass
class PointCloudDiffusionModelConfig(PointCloudProjectionModelConfig):
    model_name: str = 'pc2-diff-ho'  # default as behave

    # Diffusion arguments
    beta_start: float = 1e-5  # 0.00085
    beta_end: float = 8e-3  # 0.012
    beta_schedule: str = 'linear'  # 'custom'
    dm_pred_type: str = 'epsilon'  # diffusion model prediction type, sample (x0) or noise

    # Point cloud model arguments
    point_cloud_model: str = 'pvcnn'
    point_cloud_model_embed_dim: int = 64

    dataset_type: str = '${dataset.type}'


@dataclass
class CrossAttnHOModelConfig(PointCloudDiffusionModelConfig):
    model_name: str = 'diff-ho-attn'

    attn_type: str = 'coord3d+posenc-learnable'
    attn_weight: float = 1.0
    point_visible_test: str = 'combine'  # To compute point visibility: use all points or only human/object points


@dataclass
class DirectTransModelConfig(PointCloudProjectionModelConfig):
    model_name: str = 'direct-transl-ho'

    pooling: str = "avg"
    act: str = 'gelu'
    out_act: str = 'relu'
    # feat_dims_transl: Iterable[Any] = (384, 256, 128, 6)  # cannot use List[int] https://github.com/facebookresearch/hydra/issues/1752#issuecomment-893174197
    # feat_dims_scale: Iterable[Any] = (384, 128, 64, 2)
    feat_dims_transl: List[int] = field(default_factory=lambda: [384, 256, 128, 6])
    feat_dims_scale: List[int] = field(default_factory=lambda: [384, 128, 64, 2])
    lw_transl: float = 10000.0
    lw_scale: float = 10000.0


@dataclass
class PointCloudColoringModelConfig(PointCloudProjectionModelConfig):
    # Projection arguments
    predict_shape: bool = False
    predict_color: bool = True

    # Point cloud model arguments
    point_cloud_model: str = 'pvcnn'
    point_cloud_model_layers: int = 1
    point_cloud_model_embed_dim: int = 64


@dataclass
class DatasetConfig:
    type: str


@dataclass
class PointCloudDatasetConfig(DatasetConfig):
    eval_split: str = 'val'
    max_points: int = 16_384
    image_size: int = 224
    scale_factor: float = 1.0
    restrict_model_ids: Optional[List] = None  # for only running on a subset of data points


@dataclass
class CO3DConfig(PointCloudDatasetConfig):
    type: str = 'co3dv2'
    # root: str = os.getenv('CO3DV2_DATASET_ROOT')
    root: str = "/BS/xxie-2/work/co3d/hydrant"
    category: str = 'hydrant'
    subset_name: str = 'fewview_dev'
    mask_images: bool = '${model.use_mask}'


@dataclass
class ShapeNetR2N2Config(PointCloudDatasetConfig):
    # added by XH
    fix_sample: bool = True
    category: str = 'chair'

    type: str = 'shapenet_r2n2'
    root: str = "/BS/chiban2/work/data_shapenet/ShapeNetCore.v1"
    r2n2_dir: str = "/BS/databases20/3d-r2n2"
    shapenet_dir: str = "/BS/chiban2/work/data_shapenet/ShapeNetCore.v1"
    preprocessed_r2n2_dir: str = "${dataset.root}/r2n2_preprocessed_renders"
    splits_file: str = "${dataset.root}/r2n2_standard_splits_from_ShapeNet_taxonomy.json"
    # splits_file: str = "${dataset.root}/pix2mesh_splits_val05.json"  # <-- incorrect
    scale_factor: float = 7.0
    point_cloud_filename: str = 'pointcloud_r2n2.npz'  # should use 'pointcloud_mesh.npz'


@dataclass
class BehaveDatasetConfig(PointCloudDatasetConfig):
    # added by XH
    type: str = 'behave'

    fix_sample: bool = True
    behave_dir: str = "/BS/xxie-5/static00/behave_release/sequences/"
    split_file: str = ""  # specify your dataset split file here
    scale_factor: float = 7.0  # use the same as shapenet
    sample_ratio_hum: float = 0.5
    image_size: int = 224

    normalize_type: str = 'comb'
    smpl_type: str = 'gt'  # use which SMPL mesh to obtain normalization parameters
    test_transl_type: str = 'norm'

    load_corr_points: bool = False  # load autoencoder points for object and SMPL
    uniform_obj_sample: bool = False

    # configs for direct translation prediction
    bkg_type: str = 'none'
    bbox_params: str = 'none'
    ho_segm_pred_path: Optional[str] = None
    use_gt_transl: bool = False

    cam_noise_std: float = 0.  # add noise to the camera pose
    sep_same_crop: bool = False  # use same input image crop to separate models
    aug_blur: float = 0.  # blur augmentation

    std_coverage: float = 3.5  # a heuristic value to estimate translation

    v2v_path: str = ''  # object v2v corr path


@dataclass
class ShapeDatasetConfig(BehaveDatasetConfig):
    "the dataset to train AE for aligned shapes"
    type: str = 'shape'
    fix_sample: bool = False
    split_file: str = "/BS/xxie-2/work/pc2-diff/experiments/splits/shapes-chair.pkl"


# TODO
@dataclass
class ShapeNetNMRConfig(PointCloudDatasetConfig):
    type: str = 'shapenet_nmr'
    shapenet_nmr_dir: str = "/work/lukemk/machine-learning-datasets/3d-reconstruction/ShapeNet_NMR/NMR_Dataset"
    synset_names: str = 'chair'  # comma-separated or 'all'
    augmentation: str = 'all'
    scale_factor: float = 7.0


@dataclass
class AugmentationConfig:
    # need to specify the variable type in order to define it properly
    max_radius: int = 0  # generate a random square to mask object, this is the radius for the square in pixel size, zero means no occlusion


@dataclass
class DataloaderConfig:
    # batch_size: int = 8  # 2 for debug
    batch_size: int = 16
    num_workers: int = 14  # 0 for debug # suggested by accelerator for gpu20


@dataclass
class LossConfig:
    diffusion_weight: float = 1.0
    rgb_weight: float = 1.0
    consistency_weight: float = 1.0


@dataclass
class CheckpointConfig:
    resume: Optional[str] = "test"
    resume_training: bool = True
    resume_training_optimizer: bool = True
    resume_training_scheduler: bool = True
    resume_training_state: bool = True


@dataclass
class ExponentialMovingAverageConfig:
    use_ema: bool = False
    # # From Diffusers EMA (should probably switch)
    # ema_inv_gamma: float = 1.0
    # ema_power: float = 0.75
    # ema_max_decay: float = 0.9999
    decay: float = 0.999
    update_every: int = 20


@dataclass
class OptimizerConfig:
    type: str
    name: str
    lr: float = 3e-4
    weight_decay: float = 0.0
    scale_learning_rate_with_batch_size: bool = False
    gradient_accumulation_steps: int = 1
    clip_grad_norm: Optional[float] = 50.0  # 5.0
    kwargs: Dict = field(default_factory=lambda: dict())


@dataclass
class AdadeltaOptimizerConfig(OptimizerConfig):
    type: str = 'torch'
    name: str = 'Adadelta'
    kwargs: Dict = field(default_factory=lambda: dict(
        weight_decay=1e-6,
    ))


@dataclass
class AdamOptimizerConfig(OptimizerConfig):
    type: str = 'torch'
    name: str = 'AdamW'
    weight_decay: float = 1e-6
    kwargs: Dict = field(default_factory=lambda: dict(betas=(0.95, 0.999)))


@dataclass
class SchedulerConfig:
    type: str
    kwargs: Dict = field(default_factory=lambda: dict())


@dataclass
class LinearSchedulerConfig(SchedulerConfig):
    type: str = 'transformers'
    kwargs: Dict = field(default_factory=lambda: dict(
        name='linear',
        num_warmup_steps=0,
        num_training_steps="${run.max_steps}",
    ))


@dataclass
class CosineSchedulerConfig(SchedulerConfig):
    type: str = 'transformers'
    kwargs: Dict = field(default_factory=lambda: dict(
        name='cosine',
        num_warmup_steps=2000,  # 0
        num_training_steps="${run.max_steps}",
    ))


@dataclass
class ProjectConfig:
    run: RunConfig
    logging: LoggingConfig
    dataset: PointCloudDatasetConfig
    augmentations: AugmentationConfig
    dataloader: DataloaderConfig
    loss: LossConfig
    model: PointCloudProjectionModelConfig
    ema: ExponentialMovingAverageConfig
    checkpoint: CheckpointConfig
    optimizer: OptimizerConfig
    scheduler: SchedulerConfig

    defaults: List[Any] = field(default_factory=lambda: [
        'custom_hydra_run_dir',
        {'run': 'default'},
        {'logging': 'default'},
        {'model': 'ho-attn'},
        # {'dataset': 'co3d'},
        {'dataset': 'behave'},
        {'augmentations': 'default'},
        {'dataloader': 'default'},
        {'ema': 'default'},
        {'loss': 'default'},
        {'checkpoint': 'default'},
        {'optimizer': 'adam'},  # default adamw
        {'scheduler': 'linear'},
        # {'scheduler': 'cosine'},
    ])


cs = ConfigStore.instance()
cs.store(name='custom_hydra_run_dir', node=CustomHydraRunDir, package="hydra.run")
cs.store(group='run', name='default', node=RunConfig)
cs.store(group='logging', name='default', node=LoggingConfig)
cs.store(group='model', name='diffrec', node=PointCloudDiffusionModelConfig)
cs.store(group='model', name='coloring_model', node=PointCloudColoringModelConfig)
cs.store(group='model', name='direct-transl', node=DirectTransModelConfig)
cs.store(group='model', name='ho-attn', node=CrossAttnHOModelConfig)
cs.store(group='model', name='pvcnn-ae', node=PVCNNAEModelConfig)
cs.store(group='dataset', name='co3d', node=CO3DConfig)
# TODO
cs.store(group='dataset', name='shapenet_r2n2', node=ShapeNetR2N2Config)
cs.store(group='dataset', name='behave', node=BehaveDatasetConfig)
cs.store(group='dataset', name='shape', node=ShapeDatasetConfig)
# cs.store(group='dataset', name='shapenet_nmr', node=ShapeNetNMRConfig)
cs.store(group='augmentations', name='default', node=AugmentationConfig)
cs.store(group='dataloader', name='default', node=DataloaderConfig)
cs.store(group='loss', name='default', node=LossConfig)
cs.store(group='ema', name='default', node=ExponentialMovingAverageConfig)
cs.store(group='checkpoint', name='default', node=CheckpointConfig)
cs.store(group='optimizer', name='adadelta', node=AdadeltaOptimizerConfig)
cs.store(group='optimizer', name='adam', node=AdamOptimizerConfig)
cs.store(group='scheduler', name='linear', node=LinearSchedulerConfig)
cs.store(group='scheduler', name='cosine', node=CosineSchedulerConfig)
cs.store(name='configs', node=ProjectConfig)
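Because every group above is registered in Hydra's `ConfigStore` and `ProjectConfig` is stored under the name `configs`, a config can also be composed outside of `@hydra.main`, which is convenient for notebooks or tests. A minimal sketch, assuming a Hydra version that supports `version_base` (as `app.py` requires) and that importing `configs.structured` is enough to populate the store; the overrides shown are illustrative, not mandated by this commit:

```python
from hydra import initialize, compose
from omegaconf import OmegaConf

import configs.structured  # noqa: F401  -- importing this module registers all nodes in the ConfigStore

# 'configs' matches the config_name used by @hydra.main in app.py;
# config_path points at the configs/ package relative to this script.
with initialize(version_base='1.1', config_path='configs'):
    cfg = compose(config_name='configs',
                  overrides=['run.job=sample', 'dataset.std_coverage=3.8'])

print(OmegaConf.to_yaml(cfg.run))  # inspect the composed RunConfig
```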
dataset/__init__.py
ADDED
@@ -0,0 +1,301 @@
from pathlib import Path
from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union

import numpy as np
import pytorch3d
import torch
from torch.utils.data import SequentialSampler
from omegaconf import DictConfig
from pytorch3d.implicitron.dataset.data_loader_map_provider import \
    SequenceDataLoaderMapProvider
from pytorch3d.implicitron.dataset.dataset_base import FrameData
from pytorch3d.implicitron.dataset.json_index_dataset import JsonIndexDataset
from pytorch3d.implicitron.dataset.json_index_dataset_map_provider_v2 import (
    JsonIndexDatasetMapProviderV2, registry)
from pytorch3d.implicitron.tools.config import expand_args_fields
from pytorch3d.renderer.cameras import CamerasBase
from torch.utils.data import DataLoader

from configs.structured import CO3DConfig, DataloaderConfig, ProjectConfig, Optional
from .exclude_sequence import EXCLUDE_SEQUENCE, LOW_QUALITY_SEQUENCE
from .utils import DatasetMap
from .r2n2_my import R2N2Sample, collate_batched_meshes


def get_dataset(cfg: ProjectConfig):

    if cfg.dataset.type == 'co3dv2':
        dataset_cfg: CO3DConfig = cfg.dataset
        dataloader_cfg: DataloaderConfig = cfg.dataloader

        # Exclude bad and low-quality sequences, XH: why this is needed?
        exclude_sequence = []
        exclude_sequence.extend(EXCLUDE_SEQUENCE.get(dataset_cfg.category, []))
        exclude_sequence.extend(LOW_QUALITY_SEQUENCE.get(dataset_cfg.category, []))

        # Whether to load pointclouds
        kwargs = dict(
            remove_empty_masks=True,
            n_frames_per_sequence=1,
            load_point_clouds=True,
            max_points=dataset_cfg.max_points,
            image_height=dataset_cfg.image_size,
            image_width=dataset_cfg.image_size,
            mask_images=dataset_cfg.mask_images,
            exclude_sequence=exclude_sequence,
            pick_sequence=() if dataset_cfg.restrict_model_ids is None else dataset_cfg.restrict_model_ids,
        )

        # Get dataset mapper
        dataset_map_provider_type = registry.get(JsonIndexDatasetMapProviderV2, "JsonIndexDatasetMapProviderV2")
        expand_args_fields(dataset_map_provider_type)
        dataset_map_provider = dataset_map_provider_type(
            category=dataset_cfg.category,
            subset_name=dataset_cfg.subset_name,
            dataset_root=dataset_cfg.root,
            test_on_train=False,
            only_test_set=False,
            load_eval_batches=True,
            dataset_JsonIndexDataset_args=DictConfig(kwargs),
        )

        # Get datasets
        datasets = dataset_map_provider.get_dataset_map()  # how to select specific frames??

        # PATCH BUG WITH POINT CLOUD LOCATIONS!
        for dataset in (datasets["train"], datasets["val"]):
            # print(dataset.seq_annots.items())
            for key, ann in dataset.seq_annots.items():
                correct_point_cloud_path = Path(dataset.dataset_root) / Path(*Path(ann.point_cloud.path).parts[-3:])
                assert correct_point_cloud_path.is_file(), correct_point_cloud_path
                ann.point_cloud.path = str(correct_point_cloud_path)

        # Get dataloader mapper
        data_loader_map_provider_type = registry.get(SequenceDataLoaderMapProvider, "SequenceDataLoaderMapProvider")
        expand_args_fields(data_loader_map_provider_type)
        data_loader_map_provider = data_loader_map_provider_type(
            batch_size=dataloader_cfg.batch_size,
            num_workers=dataloader_cfg.num_workers,
        )

        # QUICK HACK: Patch the train dataset because it is not used but it throws an error
        if (len(datasets['train']) == 0 and len(datasets[dataset_cfg.eval_split]) > 0 and
                dataset_cfg.restrict_model_ids is not None and cfg.run.job == 'sample'):
            datasets = DatasetMap(train=datasets[dataset_cfg.eval_split], val=datasets[dataset_cfg.eval_split],
                                  test=datasets[dataset_cfg.eval_split])
            # XH: why all eval split?
            print('Note: You used restrict_model_ids and there were no ids in the train set.')

        # Get dataloaders
        dataloaders = data_loader_map_provider.get_data_loader_map(datasets)
        dataloader_train = dataloaders['train']
        dataloader_val = dataloader_vis = dataloaders[dataset_cfg.eval_split]

        # Replace validation dataloader sampler with SequentialSampler
        # seems to be randomly sampled? with a fixed random seed? but one cannot control which image is being sampled??
        dataloader_val.batch_sampler.sampler = SequentialSampler(dataloader_val.batch_sampler.sampler.data_source)

        # Modify for accelerate
        dataloader_train.batch_sampler.drop_last = True
        dataloader_val.batch_sampler.drop_last = False
    elif cfg.dataset.type == 'shapenet_r2n2':
        # from ..configs.structured import ShapeNetR2N2Config
        dataset_cfg: ShapeNetR2N2Config = cfg.dataset
        # for k in dataset_cfg:
        #     print(k)
        datasets = [R2N2Sample(dataset_cfg.max_points, dataset_cfg.fix_sample,
                               dataset_cfg.image_size, cfg.augmentations,
                               s, dataset_cfg.shapenet_dir,
                               dataset_cfg.r2n2_dir, dataset_cfg.splits_file,
                               load_textures=False, return_all_views=True) for s in ['train', 'val', 'test']]
        dataloader_train = DataLoader(datasets[0], batch_size=cfg.dataloader.batch_size,
                                      collate_fn=collate_batched_meshes,
                                      num_workers=cfg.dataloader.num_workers, shuffle=True)
        dataloader_val = DataLoader(datasets[1], batch_size=cfg.dataloader.batch_size,
                                    collate_fn=collate_batched_meshes,
                                    num_workers=cfg.dataloader.num_workers, shuffle=False)
        dataloader_vis = DataLoader(datasets[2], batch_size=cfg.dataloader.batch_size,
                                    collate_fn=collate_batched_meshes,
                                    num_workers=cfg.dataloader.num_workers, shuffle=False)

    elif cfg.dataset.type in ['behave', 'behave-objonly', 'behave-humonly', 'behave-dtransl',
                              'behave-objonly-segm', 'behave-humonly-segm', 'behave-attn',
                              'behave-test', 'behave-attn-test', 'behave-hum-pe', 'behave-hum-noscale',
                              'behave-hum-surf', 'behave-objv2v']:
        from .behave_dataset import BehaveDataset, NTUDataset, BehaveObjOnly, BehaveHumanOnly, BehaveHumanOnlyPosEnc
        from .behave_dataset import BehaveHumanOnlySegmInput, BehaveObjOnlySegmInput, BehaveTestOnly, BehaveHumNoscale
        from .behave_dataset import BehaveHumanOnlySurfSample
        from .dtransl_dataset import DirectTranslDataset
        from .behave_paths import DataPaths
        from configs.structured import BehaveDatasetConfig
        from .behave_crossattn import BehaveCrossAttnDataset, BehaveCrossAttnTest
        from .behave_dataset import BehaveObjOnlyV2V

        dataset_cfg: BehaveDatasetConfig = cfg.dataset
        # print(dataset_cfg.behave_dir)
        train_paths, val_paths = DataPaths.load_splits(dataset_cfg.split_file, dataset_cfg.behave_dir)
        # exit(0)

        # split validation paths to only consider the selected batches
        bs = cfg.dataloader.batch_size
        num_batches_total = int(np.ceil(len(val_paths)/cfg.dataloader.batch_size))
        end_idx = cfg.run.batch_end if cfg.run.batch_end is not None else num_batches_total
        # print(cfg.run.batch_end, cfg.run.batch_start, end_idx)
        val_paths = val_paths[cfg.run.batch_start*bs:end_idx*bs]

        if cfg.dataset.type == 'behave':
            train_type = BehaveDataset
            val_datatype = BehaveDataset if 'ntu' not in dataset_cfg.split_file else NTUDataset
        elif cfg.dataset.type == 'behave-test':
            train_type = BehaveDataset
            val_datatype = BehaveTestOnly
        elif cfg.dataset.type == 'behave-objonly':
            train_type = BehaveObjOnly
            val_datatype = BehaveObjOnly
            assert 'ntu' not in dataset_cfg.split_file, 'ntu not implemented!'
        elif cfg.dataset.type == 'behave-humonly':
            train_type = BehaveHumanOnly
            val_datatype = BehaveHumanOnly
            assert 'ntu' not in dataset_cfg.split_file, 'ntu not implemented!'
        elif cfg.dataset.type == 'behave-hum-noscale':
            train_type = BehaveHumNoscale
            val_datatype = BehaveHumNoscale
        elif cfg.dataset.type == 'behave-hum-pe':
            train_type = BehaveHumanOnlyPosEnc
            val_datatype = BehaveHumanOnlyPosEnc
        elif cfg.dataset.type == 'behave-hum-surf':
            train_type = BehaveHumanOnlySurfSample
            val_datatype = BehaveHumanOnlySurfSample
        elif cfg.dataset.type == 'behave-humonly-segm':
            assert cfg.dataset.ho_segm_pred_path is not None, 'please specify predicted HO segmentation!'
            train_type = BehaveHumanOnly
            val_datatype = BehaveHumanOnlySegmInput
            assert 'ntu' not in dataset_cfg.split_file, 'ntu not implemented!'
        elif cfg.dataset.type == 'behave-objonly-segm':
            assert cfg.dataset.ho_segm_pred_path is not None, 'please specify predicted HO segmentation!'
            train_type = BehaveObjOnly
            val_datatype = BehaveObjOnlySegmInput
            assert 'ntu' not in dataset_cfg.split_file, 'ntu not implemented!'
        elif cfg.dataset.type == 'behave-dtransl':
            train_type = DirectTranslDataset
            val_datatype = DirectTranslDataset
        elif cfg.dataset.type == 'behave-attn':
            train_type = BehaveCrossAttnDataset
            val_datatype = BehaveCrossAttnDataset
        elif cfg.dataset.type == 'behave-attn-test':
            train_type = BehaveCrossAttnDataset
            val_datatype = BehaveCrossAttnTest
        elif cfg.dataset.type == 'behave-objv2v':
            train_type = BehaveObjOnlyV2V
            val_datatype = BehaveObjOnlyV2V
        else:
            raise NotImplementedError

        dataset_train = train_type(train_paths, dataset_cfg.max_points, dataset_cfg.fix_sample,
                                   (dataset_cfg.image_size, dataset_cfg.image_size),
                                   split='train', sample_ratio_hum=dataset_cfg.sample_ratio_hum,
                                   normalize_type=dataset_cfg.normalize_type, smpl_type='gt',
                                   load_corr_points=dataset_cfg.load_corr_points,
                                   uniform_obj_sample=dataset_cfg.uniform_obj_sample,
                                   bkg_type=dataset_cfg.bkg_type,
                                   bbox_params=dataset_cfg.bbox_params,
                                   pred_binary=cfg.model.predict_binary,
                                   ho_segm_pred_path=cfg.dataset.ho_segm_pred_path,
                                   compute_closest_points=cfg.model.model_name=='pc2-diff-ho-tune-newloss',
                                   use_gt_transl=cfg.dataset.use_gt_transl,
                                   cam_noise_std=cfg.dataset.cam_noise_std,
                                   sep_same_crop=cfg.dataset.sep_same_crop,
                                   aug_blur=cfg.dataset.aug_blur,
                                   std_coverage=cfg.dataset.std_coverage,
                                   v2v_path=cfg.dataset.v2v_path)

        dataset_val = val_datatype(val_paths, dataset_cfg.max_points, dataset_cfg.fix_sample,
                                   (dataset_cfg.image_size, dataset_cfg.image_size),
                                   split='val', sample_ratio_hum=dataset_cfg.sample_ratio_hum,
                                   normalize_type=dataset_cfg.normalize_type, smpl_type=dataset_cfg.smpl_type,
                                   load_corr_points=dataset_cfg.load_corr_points,
                                   test_transl_type=dataset_cfg.test_transl_type,
                                   uniform_obj_sample=dataset_cfg.uniform_obj_sample,
                                   bkg_type=dataset_cfg.bkg_type,
                                   bbox_params=dataset_cfg.bbox_params,
                                   pred_binary=cfg.model.predict_binary,
                                   ho_segm_pred_path=cfg.dataset.ho_segm_pred_path,
                                   compute_closest_points=cfg.model.model_name=='pc2-diff-ho-tune-newloss',
                                   use_gt_transl=cfg.dataset.use_gt_transl,
                                   sep_same_crop=cfg.dataset.sep_same_crop,
                                   std_coverage=cfg.dataset.std_coverage,
                                   v2v_path=cfg.dataset.v2v_path)
        # dataset_test = val_datatype(val_paths, dataset_cfg.max_points, dataset_cfg.fix_sample,
        #                            (dataset_cfg.image_size, dataset_cfg.image_size),
        #                            split='test', sample_ratio_hum=dataset_cfg.sample_ratio_hum,
        #                            normalize_type=dataset_cfg.normalize_type, smpl_type=dataset_cfg.smpl_type,
        #                            load_corr_points=dataset_cfg.load_corr_points,
        #                            test_transl_type=dataset_cfg.test_transl_type,
        #                            uniform_obj_sample=dataset_cfg.uniform_obj_sample,
        #                            bkg_type=dataset_cfg.bkg_type,
        #                            bbox_params=dataset_cfg.bbox_params,
        #                            pred_binary=cfg.model.predict_binary,
        #                            ho_segm_pred_path=cfg.dataset.ho_segm_pred_path,
        #                            compute_closest_points=cfg.model.model_name=='pc2-diff-ho-tune-newloss',
        #                            use_gt_transl=cfg.dataset.use_gt_transl,
        #                            sep_same_crop=cfg.dataset.sep_same_crop)
        dataloader_train = DataLoader(dataset_train, batch_size=cfg.dataloader.batch_size,
                                      collate_fn=collate_batched_meshes,
                                      num_workers=cfg.dataloader.num_workers, shuffle=True)
        shuffle = cfg.run.job == 'train'
        dataloader_val = DataLoader(dataset_val, batch_size=cfg.dataloader.batch_size,
                                    collate_fn=collate_batched_meshes,
                                    num_workers=cfg.dataloader.num_workers, shuffle=shuffle)
        dataloader_vis = DataLoader(dataset_val, batch_size=cfg.dataloader.batch_size,
                                    collate_fn=collate_batched_meshes,
                                    num_workers=cfg.dataloader.num_workers, shuffle=shuffle)

        # datasets = [BehaveDataset(p, dataset_cfg.max_points, dataset_cfg.fix_sample,
        #                           (dataset_cfg.image_size, dataset_cfg.image_size),
        #                           split=s, sample_ratio_hum=dataset_cfg.sample_ratio_hum,
        #                           normalize_type=dataset_cfg.normalize_type) for p, s in zip([train_paths, val_paths, val_paths],
        #                                                                                      ['train', 'val', 'test'])]
        # dataloader_train = DataLoader(datasets[0], batch_size=cfg.dataloader.batch_size,
        #                               collate_fn=collate_batched_meshes,
        #                               num_workers=cfg.dataloader.num_workers, shuffle=True)
        # dataloader_val = DataLoader(datasets[1], batch_size=cfg.dataloader.batch_size,
        #                             collate_fn=collate_batched_meshes,
        #                             num_workers=cfg.dataloader.num_workers, shuffle=False)
        # dataloader_vis = DataLoader(datasets[2], batch_size=cfg.dataloader.batch_size,
        #                             collate_fn=collate_batched_meshes,
        #                             num_workers=cfg.dataloader.num_workers, shuffle=False)
    elif cfg.dataset.type in ['shape']:
        from .shape_dataset import ShapeDataset
        from .behave_paths import DataPaths
        from configs.structured import ShapeDatasetConfig
        dataset_cfg: ShapeDatasetConfig = cfg.dataset

        train_paths, _ = DataPaths.load_splits(dataset_cfg.split_file, dataset_cfg.behave_dir)
        val_paths = train_paths  # same as training, this is for overfitting
        # split validation paths to only consider the selected batches
        bs = cfg.dataloader.batch_size
        num_batches_total = int(np.ceil(len(val_paths) / cfg.dataloader.batch_size))
        end_idx = cfg.run.batch_end if cfg.run.batch_end is not None else num_batches_total
        # print(cfg.run.batch_end, cfg.run.batch_start, end_idx)
        val_paths = val_paths[cfg.run.batch_start * bs:end_idx * bs]

        dataset_train = ShapeDataset(train_paths, dataset_cfg.max_points, dataset_cfg.fix_sample,
                                     (dataset_cfg.image_size, dataset_cfg.image_size),
                                     split='train', )
        dataset_val = ShapeDataset(val_paths, dataset_cfg.max_points, dataset_cfg.fix_sample,
                                   (dataset_cfg.image_size, dataset_cfg.image_size),
                                   split='train', )
        dataloader_train = DataLoader(dataset_train, batch_size=cfg.dataloader.batch_size,
                                      collate_fn=collate_batched_meshes,
                                      num_workers=cfg.dataloader.num_workers, shuffle=True)
        shuffle = cfg.run.job == 'train'
        dataloader_val = DataLoader(dataset_val, batch_size=cfg.dataloader.batch_size,
                                    collate_fn=collate_batched_meshes,
                                    num_workers=cfg.dataloader.num_workers, shuffle=shuffle)
        dataloader_vis = DataLoader(dataset_val, batch_size=cfg.dataloader.batch_size,
                                    collate_fn=collate_batched_meshes,
                                    num_workers=cfg.dataloader.num_workers, shuffle=shuffle)
    else:
        raise NotImplementedError(cfg.dataset.type)

    return dataloader_train, dataloader_val, dataloader_vis
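`get_dataset` is the single entry point that returns the train, validation, and visualization dataloaders for whichever dataset type is configured. A rough usage sketch under a Hydra-composed config is shown below; it deliberately avoids assuming a batch schema, since the collated contents depend on dataset classes (e.g. `BehaveDataset`) that are not part of this diff:

```python
# Sketch: obtain the dataloaders exactly as the training/demo entry points would.
import hydra

from configs.structured import ProjectConfig
from dataset import get_dataset


@hydra.main(config_path='configs', config_name='configs', version_base='1.1')
def main(cfg: ProjectConfig):
    dataloader_train, dataloader_val, dataloader_vis = get_dataset(cfg)
    batch = next(iter(dataloader_val))
    # The batch layout depends on the configured dataset type and its collate function,
    # so inspect it rather than assuming fixed keys.
    print(type(batch), len(dataloader_train), len(dataloader_val))


if __name__ == '__main__':
    main()
```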
dataset/base_data.py
ADDED
@@ -0,0 +1,110 @@
from os import path as osp

import cv2
import numpy as np
from torch.utils.data import Dataset

from dataset.img_utils import masks2bbox, resize, crop


class BaseDataset(Dataset):
    def __init__(self, data_paths, input_size=(224, 224)):
        self.data_paths = data_paths  # RGB image files
        self.input_size = input_size
        opencv2py3d = np.eye(4)
        opencv2py3d[0, 0] = opencv2py3d[1, 1] = -1
        self.opencv2py3d = opencv2py3d

    def __len__(self):
        return len(self.data_paths)

    def load_masks(self, rgb_file):
        person_mask_file = rgb_file.replace('.color.jpg', ".person_mask.png")
        if not osp.isfile(person_mask_file):
            person_mask_file = rgb_file.replace('.color.jpg', ".person_mask.jpg")
        obj_mask_file = None
        for pat in [".obj_rend_mask.png", ".obj_rend_mask.jpg", ".obj_mask.png", ".obj_mask.jpg", ".object_rend.png"]:
            obj_mask_file = rgb_file.replace('.color.jpg', pat)
            if osp.isfile(obj_mask_file):
                break
        person_mask = cv2.imread(person_mask_file, cv2.IMREAD_GRAYSCALE)
        obj_mask = cv2.imread(obj_mask_file, cv2.IMREAD_GRAYSCALE)

        return person_mask, obj_mask

    def get_crop_params(self, mask_hum, mask_obj, bbox_exp=1.0):
        "compute bounding box based on masks"
        bmin, bmax = masks2bbox([mask_hum, mask_obj])
        crop_center = (bmin + bmax) // 2
        # crop_size = np.max(bmax - bmin)
        crop_size = int(np.max(bmax - bmin) * bbox_exp)
        if crop_size % 2 == 1:
            crop_size += 1  # make sure it is an even number
        return bmax, bmin, crop_center, crop_size

    def is_behave_dataset(self, image_width):
        assert image_width in [2048, 1920, 1024, 960], f'unknown image width {image_width}!'
        if image_width in [2048, 1024]:
            is_behave = True
        else:
            is_behave = False
        return is_behave

    def compute_K_roi(self, bbox_square,
                      image_width=2048,
                      image_height=1536,
                      fx=979.7844, fy=979.840,
                      cx=1018.952, cy=779.486):
        "return results in ndc coordinate, this is correct!!!"
        x, y, b, w = bbox_square
        assert b == w
        is_behave = self.is_behave_dataset(image_width)

        if is_behave:
            assert image_height / image_width == 0.75, f"invalid image aspect ratio: width={image_width}, height={image_height}"
            # the image might be rendered at different size
            ratio = image_width/2048.
            fx, fy = 979.7844*ratio, 979.840*ratio
            cx, cy = 1018.952*ratio, 779.486*ratio
        else:
            assert image_height / image_width == 9/16, f"invalid image aspect ratio: width={image_width}, height={image_height}"
            # intercap camera
            ratio = image_width/1920
            fx, fy = 918.457763671875*ratio, 918.4373779296875*ratio
            cx, cy = 956.9661865234375*ratio, 555.944580078125*ratio

        cx, cy = cx - x, cy - y
        scale = b/2.
        # in ndc
        cx_ = (scale - cx)/scale
        cy_ = (scale - cy)/scale
        fx_ = fx/scale
        fy_ = fy/scale

        K_roi = np.array([
            [fx_, 0, cx_, 0],
            [0., fy_, cy_, 0, ],
            [0, 0, 0, 1.],
            [0, 0, 1, 0]
        ])
        return K_roi

    def crop_full_image(self, mask_hum, mask_obj, rgb_full, crop_masks, bbox_exp=1.0):
        """
        crop the image based on the given masks
        :param mask_hum:
        :param mask_obj:
        :param rgb_full:
        :param crop_masks: a list of masks used to do the crop
        :return: Kroi, cropped human, object mask and RGB images (background masked out).
        """
        bmax, bmin, crop_center, crop_size = self.get_crop_params(*crop_masks, bbox_exp)
        rgb = resize(crop(rgb_full, crop_center, crop_size), self.input_size) / 255.
        person_mask = resize(crop(mask_hum, crop_center, crop_size), self.input_size) / 255.
        obj_mask = resize(crop(mask_obj, crop_center, crop_size), self.input_size) / 255.
        xywh = np.concatenate([crop_center - crop_size // 2, np.array([crop_size, crop_size])])
        Kroi = self.compute_K_roi(xywh, rgb_full.shape[1], rgb_full.shape[0])
        # mask bkg out
        mask_comb = (person_mask > 0.5) | (obj_mask > 0.5)
        rgb = rgb * np.expand_dims(mask_comb, -1)
        return Kroi, obj_mask, person_mask, rgb
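These crop helpers are what the demo and BEHAVE datasets build on: given the human and object masks, they crop a square region around both, resize it to the network input size, and return ROI-adjusted intrinsics in NDC convention. A small sketch of direct usage on one example frame follows; it assumes the image has a BEHAVE/InterCap resolution (the width assert in `is_behave_dataset` requires this) and that `masks2bbox`, `crop`, and `resize` from `dataset/img_utils.py` (added in this commit) behave as their names suggest:

```python
# Sketch: use the BaseDataset crop helpers on one bundled example frame.
import cv2

from dataset.base_data import BaseDataset

dataset = BaseDataset(['examples/017450/k1.color.jpg'], input_size=(224, 224))

rgb_file = dataset.data_paths[0]
rgb_full = cv2.imread(rgb_file)[:, :, ::-1]        # BGR -> RGB
mask_hum, mask_obj = dataset.load_masks(rgb_file)  # grayscale masks stored next to the RGB file

# Crop around the union of both masks and get the ROI-adjusted camera matrix
Kroi, obj_mask, person_mask, rgb = dataset.crop_full_image(
    mask_hum, mask_obj, rgb_full, crop_masks=[mask_hum, mask_obj], bbox_exp=1.0)

print(Kroi.shape, rgb.shape)  # (4, 4) intrinsics in NDC, (224, 224, 3) masked RGB crop
```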
dataset/behave_paths.py
ADDED
@@ -0,0 +1,228 @@
1 |
+
import glob
|
2 |
+
import os, re
|
3 |
+
import pickle as pkl
|
4 |
+
from os.path import join, basename, dirname, isfile
|
5 |
+
import os.path as osp
|
6 |
+
|
7 |
+
import cv2, json
|
8 |
+
import numpy as np
|
9 |
+
|
10 |
+
# PROCESSED_PATH = paths['PROCESSED_PATH']
|
11 |
+
BEHAVE_PATH = "/BS/xxie-5/static00/behave_release/sequences/"
|
12 |
+
RECON_PATH = "/BS/xxie-5/static00/behave-train"
|
13 |
+
|
14 |
+
class DataPaths:
|
15 |
+
"""
|
16 |
+
class to handle path operations based on BEHAVE dataset structure
|
17 |
+
"""
|
18 |
+
def __init__(self):
|
19 |
+
pass
|
20 |
+
|
21 |
+
@staticmethod
|
22 |
+
def load_splits(split_file, dataset_path=None):
|
23 |
+
assert os.path.exists(dataset_path), f'the given dataset path {dataset_path} does not exist, please check if your training data are placed over there!'
|
24 |
+
train, val = DataPaths.get_train_test_from_pkl(split_file)
|
25 |
+
return train, val
|
26 |
+
# print(train[:5], val[:5])
|
27 |
+
if isinstance(train[0], list):
|
28 |
+
# video data
|
29 |
+
train_full = [[join(dataset_path, seq[x]) for x in range(len(seq))] for seq in train]
|
30 |
+
val_full = [[join(dataset_path, seq[x]) for x in range(len(seq))] for seq in val]
|
31 |
+
else:
|
32 |
+
train_full = [join(dataset_path, x) for x in train] # full path to the training data
|
33 |
+
val_full = [join(dataset_path, x) for x in val] # full path to the validation data files
|
34 |
+
# print(train_full[:5], val_full[:5])
|
35 |
+
return train_full, val_full
|
36 |
+
|
37 |
+
@staticmethod
|
38 |
+
def load_splits_online(split_file, dataset_path=BEHAVE_PATH):
|
39 |
+
"load rgb file, smpl and object mesh paths"
|
40 |
+
keys = ['rgb', 'smpl', 'obj']
|
41 |
+
types = ['train', 'val']
|
42 |
+
splits = {}
|
43 |
+
data = pkl.load(open(split_file, 'rb'))
|
44 |
+
for type in types:
|
45 |
+
for key in keys:
|
46 |
+
k = f'{type}_{key}'
|
47 |
+
splits[k] = [join(dataset_path, x) for x in data[k]]
|
48 |
+
return splits
|
49 |
+
|
50 |
+
@staticmethod
|
51 |
+
def get_train_test_from_pkl(pkl_file):
|
52 |
+
data = pkl.load(open(pkl_file, 'rb'))
|
53 |
+
return data['train'], data['test']
|
54 |
+
|
55 |
+
@staticmethod
|
56 |
+
def get_image_paths_seq(seq, tid=1, check_occlusion=False, pat='t*.000'):
|
57 |
+
"""
|
58 |
+
find all image paths in one sequence
|
59 |
+
:param seq: path to one behave sequence
|
60 |
+
:param tid: test on images from which camera
|
61 |
+
:param check_occlusion: whether to load full object mask and check occlusion ratio
|
62 |
+
:return: a list of paths to test image files
|
63 |
+
"""
|
64 |
+
image_files = sorted(glob.glob(seq + f"/{pat}/k{tid}.color.jpg"))
|
65 |
+
# print(image_files, seq + f"/{pat}/k{tid}.color.jpg")
|
66 |
+
if not check_occlusion:
|
67 |
+
return image_files
|
68 |
+
# check object occlusion ratio
|
69 |
+
valid_files = []
|
70 |
+
count = 0
|
71 |
+
for img_file in image_files:
|
72 |
+
mask_file = img_file.replace('.color.jpg', '.obj_rend_mask.png')
|
73 |
+
if not os.path.isfile(mask_file):
|
74 |
+
mask_file = img_file.replace('.color.jpg', '.obj_rend_mask.jpg')
|
75 |
+
full_mask_file = img_file.replace('.color.jpg', '.obj_rend_full.png')
|
76 |
+
if not os.path.isfile(full_mask_file):
|
77 |
+
full_mask_file = img_file.replace('.color.jpg', '.obj_rend_full.jpg')
|
78 |
+
if not isfile(mask_file) or not isfile(full_mask_file):
|
79 |
+
continue
|
80 |
+
|
81 |
+
mask = np.sum(cv2.imread(mask_file, cv2.IMREAD_GRAYSCALE) > 127)
|
82 |
+
mask_full = np.sum(cv2.imread(full_mask_file, cv2.IMREAD_GRAYSCALE) > 127)
|
83 |
+
if mask_full == 0:
|
84 |
+
count += 1
|
85 |
+
continue
|
86 |
+
|
87 |
+
ratio = mask / mask_full
|
88 |
+
if ratio > 0.3:
|
89 |
+
valid_files.append(img_file)
|
90 |
+
else:
|
91 |
+
count += 1
|
92 |
+
print(f'{mask_file} occluded by {1 - ratio}!')
|
93 |
+
return valid_files
|
94 |
+
|
95 |
+
@staticmethod
|
96 |
+
def get_kinect_id(rgb_file):
|
97 |
+
"extract kinect id from the rgb file"
|
98 |
+
filename = osp.basename(rgb_file)
|
99 |
+
try:
|
100 |
+
kid = int(filename.split('.')[0][1])
|
101 |
+
assert kid in [0, 1, 2, 3, 4, 5], f'found invalid kinect id {kid} for file {rgb_file}'
|
102 |
+
return kid
|
103 |
+
except Exception as e:
|
104 |
+
print(rgb_file)
|
105 |
+
raise ValueError()
|
106 |
+
|
107 |
+
@staticmethod
|
108 |
+
def get_seq_date(rgb_file):
|
109 |
+
"date for the sequence"
|
110 |
+
seq_name = str(rgb_file).split(os.sep)[-3]
|
111 |
+
date = seq_name.split('_')[0]
|
112 |
+
assert date in ['Date01', 'Date02', 'Date03', 'Date04', 'Date05', 'Date06', 'Date07',
|
113 |
+
"ICapS01", "ICapS02", "ICapS03", "Date08", "Date09"], f"invalid date for {rgb_file}"
|
114 |
+
return date
|
115 |
+
|
116 |
+
@staticmethod
|
117 |
+
def rgb2obj_path(rgb_file:str, save_name='fit01-smooth'):
|
118 |
+
"convert an rgb file to a obj mesh file"
|
119 |
+
ss = rgb_file.split(os.sep)
|
120 |
+
seq_name = ss[-3]
|
121 |
+
obj_name = seq_name.split('_')[2]
|
122 |
+
real_name = obj_name
|
123 |
+
if 'chair' in obj_name:
|
124 |
+
real_name = 'chair'
|
125 |
+
if 'ball' in obj_name:
|
126 |
+
real_name = 'sports ball'
|
127 |
+
|
128 |
+
frame_folder = osp.dirname(rgb_file)
|
129 |
+
mesh_file = osp.join(frame_folder, real_name, save_name, f'{real_name}_fit.ply')
|
130 |
+
|
131 |
+
if not osp.isfile(mesh_file):
|
132 |
+
# synthetic data
|
133 |
+
mesh_file = osp.join(frame_folder, obj_name, save_name, f'{obj_name}_fit.ply')
|
134 |
+
return mesh_file
|
135 |
+
|
136 |
+
@staticmethod
|
137 |
+
def rgb2smpl_path(rgb_file:str, save_name='fit03'):
|
138 |
+
frame_folder = osp.dirname(rgb_file)
|
139 |
+
real_name = 'person'
|
140 |
+
mesh_file = osp.join(frame_folder, real_name, save_name, f'{real_name}_fit.ply')
|
141 |
+
return mesh_file
|
142 |
+
|
143 |
+
@staticmethod
|
144 |
+
def rgb2seq_frame(rgb_file:str):
|
145 |
+
"rgb file to seq_name, frame time"
|
146 |
+
ss = rgb_file.split(os.sep)
|
147 |
+
return ss[-3], ss[-2]
|
148 |
+
|
149 |
+
@staticmethod
|
150 |
+
def rgb2recon_folder(rgb_file, save_name, recon_path):
|
151 |
+
"convert rgb file to the subfolder"
|
152 |
+
dataset_path = osp.dirname(osp.dirname(osp.dirname(rgb_file)))
|
153 |
+
recon_folder = osp.join(osp.dirname(rgb_file.replace(dataset_path, recon_path)), save_name)
|
154 |
+
return recon_folder
|
155 |
+
|
156 |
+
@staticmethod
|
157 |
+
def get_seq_name(rgb_file):
|
158 |
+
return osp.basename(osp.dirname(osp.dirname(rgb_file)))
|
159 |
+
|
160 |
+
@staticmethod
|
161 |
+
def rgb2template_path(rgb_file):
|
162 |
+
"return the path to the object template"
|
163 |
+
from recon.opt_utils import get_template_path
|
164 |
+
# seq_name = DataPaths.get_seq_name(rgb_file)
|
165 |
+
# obj_name = seq_name.split('_')[2]
|
166 |
+
obj_name = DataPaths.rgb2object_name(rgb_file)
|
167 |
+
path = get_template_path(BEHAVE_PATH+"/../objects", obj_name)
|
168 |
+
return path
|
169 |
+
|
170 |
+
@staticmethod
|
171 |
+
def rgb2object_name(rgb_file):
|
172 |
+
seq_name = DataPaths.get_seq_name(rgb_file)
|
173 |
+
obj_name = seq_name.split('_')[2]
|
174 |
+
return obj_name
|
175 |
+
|
176 |
+
@staticmethod
|
177 |
+
def rgb2recon_frame(rgb_file, recon_path=RECON_PATH):
|
178 |
+
"return the frame folder in recon path"
|
179 |
+
ss = rgb_file.split(os.sep)
|
180 |
+
seq_name, frame = ss[-3], ss[-2]
|
181 |
+
return osp.join(recon_path, seq_name, frame)
|
182 |
+
|
183 |
+
@staticmethod
|
184 |
+
def rgb2gender(rgb_file):
|
185 |
+
"find the gender of this image"
|
186 |
+
seq_name = str(rgb_file).split(os.sep)[-3]
|
187 |
+
sub = seq_name.split('_')[1]
|
188 |
+
return _sub_gender[sub]
|
189 |
+
|
190 |
+
@staticmethod
|
191 |
+
def get_dataset_root(rgb_file):
|
192 |
+
"return the root path to all sequences"
|
193 |
+
from pathlib import Path
|
194 |
+
path = Path(rgb_file)
|
195 |
+
return str(path.parents[2])
|
196 |
+
|
197 |
+
@staticmethod
|
198 |
+
def seqname2gender(seq_name:str):
|
199 |
+
sub = seq_name.split('_')[1]
|
200 |
+
return _sub_gender[sub]
|
201 |
+
|
202 |
+
ICAP_PATH = "/BS/xxie-6/static00/InterCap" # assume same root folder
|
203 |
+
date_seqs = {
|
204 |
+
"Date01": BEHAVE_PATH + "/Date01_Sub01_backpack_back",
|
205 |
+
"Date02": BEHAVE_PATH + "/Date02_Sub02_backpack_back",
|
206 |
+
"Date03": BEHAVE_PATH + "/Date03_Sub03_backpack_back",
|
207 |
+
"Date04": BEHAVE_PATH + "/Date04_Sub05_backpack",
|
208 |
+
"Date05": BEHAVE_PATH + "/Date05_Sub05_backpack",
|
209 |
+
"Date06": BEHAVE_PATH + "/Date06_Sub07_backpack_back",
|
210 |
+
"Date07": BEHAVE_PATH + "/Date07_Sub04_backpack_back",
|
211 |
+
# "Date08": "/BS/xxie-6/static00/synthesize/Date08_Subxx_chairwood_synzv2-02",
|
212 |
+
"Date08": "/BS/xxie-6/static00/synz-backup/Date08_Subxx_chairwood_synzv2-02",
|
213 |
+
"Date09": "/BS/xxie-6/static00/synthesize/Date09_Subxx_obj01_icap", # InterCap sequence synz
|
214 |
+
"ICapS01": ICAP_PATH + "/ICapS01_sub01_obj01_Seg_0",
|
215 |
+
"ICapS02": ICAP_PATH + "/ICapS02_sub01_obj08_Seg_0",
|
216 |
+
"ICapS03": ICAP_PATH + "/ICapS03_sub07_obj05_Seg_0",
|
217 |
+
}
|
218 |
+
|
219 |
+
_sub_gender = {
|
220 |
+
"Sub01": 'male',
|
221 |
+
"Sub02": 'male',
|
222 |
+
"Sub03": 'male',
|
223 |
+
"Sub04": 'male',
|
224 |
+
"Sub05": 'male',
|
225 |
+
"Sub06": 'female',
|
226 |
+
"Sub07": 'female',
|
227 |
+
"Sub08": 'female',
|
228 |
+
}
|
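A minimal sketch of how these `DataPaths` helpers are typically chained for one frame; the frame path below is hypothetical and only assumes the standard `<root>/<seq_name>/<frame>/k<kid>.color.jpg` layout used throughout this file.

```python
from dataset.behave_paths import DataPaths

# Hypothetical BEHAVE-style frame path (sequence / frame / kinect view).
rgb_file = "/data/behave/sequences/Date03_Sub03_backpack_back/t0005.000/k1.color.jpg"

seq_name, frame = DataPaths.rgb2seq_frame(rgb_file)  # ('Date03_Sub03_backpack_back', 't0005.000')
kid = DataPaths.get_kinect_id(rgb_file)              # 1
date = DataPaths.get_seq_date(rgb_file)              # 'Date03'
gender = DataPaths.rgb2gender(rgb_file)              # 'male' (Sub03)
obj_name = DataPaths.rgb2object_name(rgb_file)       # 'backpack'
smpl_ply = DataPaths.rgb2smpl_path(rgb_file)         # .../t0005.000/person/fit03/person_fit.ply
obj_ply = DataPaths.rgb2obj_path(rgb_file)           # .../t0005.000/backpack/fit01-smooth/backpack_fit.ply
```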
dataset/demo_dataset.py
ADDED
@@ -0,0 +1,198 @@
1 |
+
import os
|
2 |
+
import numpy as np
|
3 |
+
import cv2
|
4 |
+
import torch
|
5 |
+
|
6 |
+
from .base_data import BaseDataset
|
7 |
+
from .behave_paths import DataPaths
|
8 |
+
from .img_utils import compute_translation, masks2bbox, crop
|
9 |
+
|
10 |
+
|
11 |
+
def padTo_4x3(rgb, person_mask, obj_mask, aspect_ratio=0.75):
|
12 |
+
"""
|
13 |
+
pad images to have 4:3 aspect ratio
|
14 |
+
:param rgb: (H, W, 3)
|
15 |
+
:param person_mask:
|
16 |
+
:param obj_mask:
|
17 |
+
:return: all images at the given aspect ratio
|
18 |
+
"""
|
19 |
+
h, w = rgb.shape[:2]
|
20 |
+
if w > h * 1/aspect_ratio:
|
21 |
+
# pad top
|
22 |
+
h_4x3 = int(w * aspect_ratio)
|
23 |
+
pad_top = h_4x3 - h
|
24 |
+
rgb_pad = np.pad(rgb, ((pad_top, 0), (0, 0), (0, 0)))
|
25 |
+
person_mask = np.pad(person_mask, ((pad_top, 0), (0, 0))) if person_mask is not None else None
|
26 |
+
obj_mask = np.pad(obj_mask, ((pad_top, 0), (0, 0))) if obj_mask is not None else None
|
27 |
+
else:
|
28 |
+
# pad two side
|
29 |
+
w_new = np.lcm.reduce([h * 2, 16])  # least common multiple
|
30 |
+
h_4x3 = int(w_new * aspect_ratio)
|
31 |
+
pad_top = h_4x3 - h
|
32 |
+
pad_left = (w_new - w) // 2
|
33 |
+
pad_right = w_new - w - pad_left
|
34 |
+
rgb_pad = np.pad(rgb, ((pad_top, 0), (pad_left, pad_right), (0, 0)))
|
35 |
+
obj_mask = np.pad(obj_mask, ((pad_top, 0), (pad_left, pad_right))) if obj_mask is not None else None
|
36 |
+
person_mask = np.pad(person_mask, ((pad_top, 0), (pad_left, pad_right))) if person_mask is not None else None
|
37 |
+
return rgb_pad, obj_mask, person_mask
|
38 |
+
|
39 |
+
|
40 |
+
def recrop_input(rgb, person_mask, obj_mask, dataset_name='behave'):
|
41 |
+
"recrop input images"
|
42 |
+
exp_ratio = 1.42
|
43 |
+
if dataset_name == 'behave':
|
44 |
+
mean_center = np.array([1008, 995]) # mean RGB image crop center
|
45 |
+
behave_size = (2048, 1536)
|
46 |
+
new_size = (int(750 * exp_ratio), int(exp_ratio * 750))
|
47 |
+
else:
|
48 |
+
mean_center = np.array([904, 668]) # mean RGB image crop center for bottle sequences of ICAP
|
49 |
+
behave_size = (1920, 1080)
|
50 |
+
new_size = (int(593.925 * exp_ratio), int(exp_ratio * 593.925)) # mean width of bottle sequences
|
51 |
+
aspect_ratio = behave_size[1] / behave_size[0]
|
52 |
+
pad_top = mean_center[1] - new_size[0] // 2
|
53 |
+
pad_bottom = behave_size[1] - (mean_center[1] + new_size[0] // 2)
|
54 |
+
pad_left = mean_center[0] - new_size[0] // 2
|
55 |
+
pad_right = behave_size[0] - (mean_center[0] + new_size[0] // 2)
|
56 |
+
|
57 |
+
# First resize to the same aspect ratio
|
58 |
+
if rgb.shape[0] / rgb.shape[1] != aspect_ratio:
|
59 |
+
rgb, obj_mask, person_mask = padTo_4x3(rgb, person_mask, obj_mask, aspect_ratio)
|
60 |
+
|
61 |
+
# Resize to the same size as behave image, to have a comparable pixel size
|
62 |
+
rgb = cv2.resize(rgb, behave_size)
|
63 |
+
mask_ps = cv2.resize(person_mask, behave_size)
|
64 |
+
mask_obj = cv2.resize(obj_mask, behave_size)
|
65 |
+
|
66 |
+
# Crop and resize the human + object patch
|
67 |
+
bmin, bmax = masks2bbox([mask_ps, mask_obj])
|
68 |
+
center = (bmin + bmax) // 2
|
69 |
+
crop_size = int(np.max(bmax - bmin) * exp_ratio) # larger crop to have background
|
70 |
+
img_crop = cv2.resize(crop(rgb, center, crop_size), new_size)
|
71 |
+
mask_ps = cv2.resize(crop(mask_ps, center, crop_size), new_size)
|
72 |
+
mask_obj = cv2.resize(crop(mask_obj, center, crop_size), new_size)
|
73 |
+
|
74 |
+
# Pad back to have same shape as behave image
|
75 |
+
img_full = np.pad(img_crop, [[pad_top, pad_bottom], [pad_left, pad_right], [0, 0]])
|
76 |
+
mask_ps_full = np.pad(mask_ps, [[pad_top, pad_bottom], [pad_left, pad_right]])
|
77 |
+
mask_obj_full = np.pad(mask_obj, [[pad_top, pad_bottom], [pad_left, pad_right]])
|
78 |
+
|
79 |
+
# Make sure the image shape is the same
|
80 |
+
if img_full.shape[:2] != behave_size[::-1]:
|
81 |
+
img_full = cv2.resize(img_full, behave_size)
|
82 |
+
mask_ps_full = cv2.resize(mask_ps_full, behave_size)
|
83 |
+
mask_obj_full = cv2.resize(mask_obj_full, behave_size)
|
84 |
+
return img_full, mask_ps_full, mask_obj_full
|
85 |
+
|
86 |
+
|
87 |
+
class DemoDataset(BaseDataset):
|
88 |
+
def __init__(self, data_paths, input_size=(224, 224),
|
89 |
+
std_coverage=3.5, # used to estimate camera translation
|
90 |
+
):
|
91 |
+
super().__init__(data_paths, input_size)
|
92 |
+
self.std_coverage = std_coverage
|
93 |
+
|
94 |
+
def __len__(self):
|
95 |
+
return len(self.data_paths)
|
96 |
+
|
97 |
+
def __getitem__(self, idx):
|
98 |
+
rgb_file = self.data_paths[idx]
|
99 |
+
mask_hum, mask_obj = self.load_masks(rgb_file)
|
100 |
+
rgb_full = cv2.imread(rgb_file)[:, :, ::-1]
|
101 |
+
|
102 |
+
return self.image2dict(mask_hum, mask_obj, rgb_full, rgb_file)
|
103 |
+
|
104 |
+
def image2dict(self, mask_hum, mask_obj, rgb_full, rgb_file=None):
|
105 |
+
"do all the necessary preprocessing for images"
|
106 |
+
if rgb_full.shape[:2] != mask_obj.shape[:2]:
|
107 |
+
raise ValueError(f"The given object mask shape {mask_obj.shape[:2]} does not match the RGB image shape {rgb_full.shape[:2]}")
|
108 |
+
if rgb_full.shape[:2] != mask_hum.shape[:2]:
|
109 |
+
raise ValueError(f"The given human mask shape {mask_hum.shape[:2]} does not match the RGB image shape {rgb_full.shape[:2]}")
|
110 |
+
|
111 |
+
if rgb_full.shape[:2] not in [(1080, 1920), (1536, 2048)]:
|
112 |
+
# crop and resize the image to behave image size
|
113 |
+
print(f"Recropping the input image and masks for {rgb_file}")
|
114 |
+
rgb_full, mask_hum, mask_obj = recrop_input(rgb_full, mask_hum, mask_obj)
|
115 |
+
color_h, color_w = rgb_full.shape[:2]
|
116 |
+
# Input to the first stage model: human + object crop
|
117 |
+
Kroi, objmask_fullcrop, psmask_fullcrop, rgb_fullcrop = self.crop_full_image(mask_hum.copy(),
|
118 |
+
mask_obj.copy(),
|
119 |
+
rgb_full.copy(),
|
120 |
+
[mask_hum, mask_obj],
|
121 |
+
1.00)
|
122 |
+
# Input to the second stage model: human and object crops
|
123 |
+
Kroi_h, masko_hum, maskh_hum, rgb_hum = self.crop_full_image(mask_hum.copy(),
|
124 |
+
mask_obj.copy(),
|
125 |
+
rgb_full.copy(),
|
126 |
+
[mask_hum, mask_hum], 1.05)
|
127 |
+
Kroi_o, masko_obj, maskh_obj, rgb_obj = self.crop_full_image(mask_hum.copy(),
|
128 |
+
mask_obj.copy(),
|
129 |
+
rgb_full.copy(),
|
130 |
+
[mask_obj, mask_obj], 1.5)
|
131 |
+
# Estimate camera translation
|
132 |
+
cent_transform = np.eye(4) # the transform applied to the mesh that moves it back to kinect camera frame
|
133 |
+
bmin_ho, bmax_ho = masks2bbox([mask_hum, mask_obj])
|
134 |
+
crop_size_ho = int(np.max(bmax_ho - bmin_ho) * 1.0)
|
135 |
+
if crop_size_ho % 2 == 1:
|
136 |
+
crop_size_ho += 1 # make sure it is an even number
|
137 |
+
is_behave = self.is_behave_dataset(rgb_full.shape[1])
|
138 |
+
if rgb_full.shape[1] not in [2048, 1920]:
|
139 |
+
raise ValueError('the image is not normalized to BEHAVE or ICAP size!')
|
140 |
+
indices = np.indices(rgb_full.shape[:2])
|
141 |
+
if np.sum(mask_obj > 127) < 5:
|
142 |
+
raise ValueError(f'not enough object mask found for {rgb_file}')
|
143 |
+
pts_h = np.stack([indices[1][mask_hum > 127], indices[0][mask_hum > 127]], -1)
|
144 |
+
pts_o = np.stack([indices[1][mask_obj > 127], indices[0][mask_obj > 127]], -1)
|
145 |
+
proj_cent_est = (np.mean(pts_h, 0) + np.mean(pts_o, 0)) / 2. # heuristic to obtain 2d projection center
|
146 |
+
transl_estimate = compute_translation(proj_cent_est, crop_size_ho, is_behave, self.std_coverage)
|
147 |
+
cent_transform[:3, 3] = transl_estimate / 7.0
|
148 |
+
radius = 0.5 # don't do normalization anymore
|
149 |
+
cent = transl_estimate / 7.0
|
150 |
+
comb = np.matmul(self.opencv2py3d, cent_transform)
|
151 |
+
R = torch.from_numpy(comb[:3, :3]).float()
|
152 |
+
T = torch.from_numpy(comb[:3, 3]).float() / (radius * 2)
|
153 |
+
data_dict = {
|
154 |
+
"R": R,
|
155 |
+
"T": T,
|
156 |
+
"K": torch.from_numpy(Kroi).float(),
|
157 |
+
"T_ho": torch.from_numpy(cent).float(), # translation for H+O
|
158 |
+
"image_path": rgb_file,
|
159 |
+
"image_size_hw": torch.tensor(self.input_size),
|
160 |
+
"images": torch.from_numpy(rgb_fullcrop).float().permute(2, 0, 1),
|
161 |
+
"masks": torch.from_numpy(np.stack([psmask_fullcrop, objmask_fullcrop], 0)).float(),
|
162 |
+
'orig_image_size': torch.tensor([color_h, color_w]),
|
163 |
+
|
164 |
+
# Human input to stage 2
|
165 |
+
"images_hum": torch.from_numpy(rgb_hum).float().permute(2, 0, 1),
|
166 |
+
"masks_hum": torch.from_numpy(np.stack([maskh_hum, masko_hum], 0)).float(),
|
167 |
+
"K_hum": torch.from_numpy(Kroi_h).float(),
|
168 |
+
|
169 |
+
# Object input to stage 2
|
170 |
+
"images_obj": torch.from_numpy(rgb_obj).float().permute(2, 0, 1),
|
171 |
+
"masks_obj": torch.from_numpy(np.stack([maskh_obj, masko_obj], 0)).float(),
|
172 |
+
"K_obj": torch.from_numpy(Kroi_o).float(),
|
173 |
+
|
174 |
+
# some normalization parameters
|
175 |
+
"gt_trans": cent,
|
176 |
+
'radius': radius,
|
177 |
+
"estimated_trans": transl_estimate,
|
178 |
+
}
|
179 |
+
return data_dict
|
180 |
+
|
181 |
+
def image2batch(self, rgb, mask_hum, mask_obj):
|
182 |
+
"""
|
183 |
+
given input image, convert it into a batch object ready for model inference
|
184 |
+
:param rgb: (h, w, 3), np array
|
185 |
+
:param mask_hum: (h, w, 3), np array
|
186 |
+
:param mask_obj: (h, w, 3), np array
|
187 |
+
:return:
|
188 |
+
"""
|
189 |
+
mask_hum = np.mean(mask_hum, -1)
|
190 |
+
mask_obj = np.mean(mask_obj, -1)
|
191 |
+
|
192 |
+
data_dict = self.image2dict(mask_hum, mask_obj, rgb, 'input image')
|
193 |
+
# convert dict to list
|
194 |
+
new_dict = {k:[v] for k, v in data_dict.items()}
|
195 |
+
|
196 |
+
return new_dict
|
197 |
+
|
198 |
+
|
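A minimal sketch of pushing a single in-memory image through `DemoDataset.image2batch`, as an interactive demo might; it uses the example files added in this commit and assumes the base dataset accepts an empty path list.

```python
import cv2
from dataset.demo_dataset import DemoDataset

rgb = cv2.imread("examples/017450/k1.color.jpg")[:, :, ::-1]    # (H, W, 3), BGR -> RGB
mask_hum = cv2.imread("examples/017450/k1.person_mask.png")     # (H, W, 3), values 0-255
mask_obj = cv2.imread("examples/017450/k1.obj_rend_mask.png")   # (H, W, 3), values 0-255

dataset = DemoDataset([], input_size=(224, 224), std_coverage=3.5)
batch = dataset.image2batch(rgb, mask_hum, mask_obj)  # dict of single-element lists
print(batch.keys())
print(batch["images"][0].shape)  # cropped H+O image as a (3, H, W) tensor
```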
dataset/img_utils.py
ADDED
@@ -0,0 +1,149 @@
1 |
+
"""
|
2 |
+
common functions for image operations
|
3 |
+
"""
|
4 |
+
|
5 |
+
import cv2
|
6 |
+
import numpy as np
|
7 |
+
|
8 |
+
|
9 |
+
def crop(img, center, crop_size):
|
10 |
+
"""
|
11 |
+
crop image around the given center, pad zeros for borders
|
12 |
+
:param img:
|
13 |
+
:param center: np array
|
14 |
+
:param crop_size: np array or a float size of the resulting crop
|
15 |
+
:return: a square crop around the center
|
16 |
+
"""
|
17 |
+
assert isinstance(img, np.ndarray)
|
18 |
+
h, w = img.shape[:2]
|
19 |
+
topleft = np.round(center - crop_size / 2).astype(int)
|
20 |
+
bottom_right = np.round(center + crop_size / 2).astype(int)
|
21 |
+
|
22 |
+
x1 = max(0, topleft[0])
|
23 |
+
y1 = max(0, topleft[1])
|
24 |
+
x2 = min(w - 1, bottom_right[0])
|
25 |
+
y2 = min(h - 1, bottom_right[1])
|
26 |
+
cropped = img[y1:y2, x1:x2]
|
27 |
+
|
28 |
+
p1 = max(0, -topleft[0]) # padding in x, top
|
29 |
+
p2 = max(0, -topleft[1]) # padding in y, top
|
30 |
+
p3 = max(0, bottom_right[0] - w + 1) # padding in x, bottom
|
31 |
+
p4 = max(0, bottom_right[1] - h + 1) # padding in y, bottom
|
32 |
+
|
33 |
+
dim = len(img.shape)
|
34 |
+
if dim == 3:
|
35 |
+
padded = np.pad(cropped, [[p2, p4], [p1, p3], [0, 0]])
|
36 |
+
elif dim == 2:
|
37 |
+
padded = np.pad(cropped, [[p2, p4], [p1, p3]])
|
38 |
+
else:
|
39 |
+
raise NotImplementedError  # NotImplemented is a constant, not an exception class, and cannot be raised
|
40 |
+
return padded
|
41 |
+
|
42 |
+
|
43 |
+
def resize(img, img_size, mode=cv2.INTER_LINEAR):
|
44 |
+
"""
|
45 |
+
resize image to the input
|
46 |
+
:param img:
|
47 |
+
:param img_size: (width, height) of the target image size
|
48 |
+
:param mode:
|
49 |
+
:return:
|
50 |
+
"""
|
51 |
+
h, w = img.shape[:2]
|
52 |
+
load_ratio = 1.0 * w / h
|
53 |
+
netin_ratio = 1.0 * img_size[0] / img_size[1]
|
54 |
+
assert load_ratio == netin_ratio, "image aspect ratio not matching, given image: {}, net input: {}".format(
|
55 |
+
img.shape, img_size)
|
56 |
+
resized = cv2.resize(img, img_size, interpolation=mode)
|
57 |
+
return resized
|
58 |
+
|
59 |
+
|
60 |
+
def masks2bbox(masks, threshold=127):
|
61 |
+
"""
|
62 |
+
|
63 |
+
:param masks: a list of (H, W) mask images
|
64 |
+
:param threshold: pixel values above this threshold count as foreground
|
65 |
+
:return: (bmin, bmax), the min and max corners of the bounding box enclosing all masks
|
66 |
+
"""
|
67 |
+
mask_comb = np.zeros_like(masks[0], dtype=bool)
|
68 |
+
for m in masks:
|
69 |
+
mask_comb = mask_comb | (m > threshold)
|
70 |
+
|
71 |
+
yid, xid = np.where(mask_comb)
|
72 |
+
bmin = np.array([xid.min(), yid.min()])
|
73 |
+
bmax = np.array([xid.max(), yid.max()])
|
74 |
+
return bmin, bmax
|
75 |
+
|
76 |
+
|
77 |
+
def compute_translation(crop_center, crop_size, is_behave=True, std_coverage=3.5):
|
78 |
+
"""
|
79 |
+
solve for an optimal translation that projects a Gaussian at the origin into the given 2D crop
|
80 |
+
Parameters
|
81 |
+
----------
|
82 |
+
crop_center: (x, y) of the crop center
|
83 |
+
crop_size: float, the size of the square crop
|
84 |
+
std_coverage: which edge point should be projected back to the edge of the 2d crop
|
85 |
+
|
86 |
+
Returns
|
87 |
+
-------
|
88 |
+
the estimated translation
|
89 |
+
|
90 |
+
"""
|
91 |
+
x0, y0 = crop_center
|
92 |
+
x1, y1 = x0 + crop_size/2, y0
|
93 |
+
x2, y2 = x0 - crop_size/2, y0
|
94 |
+
x3, y3 = x0, y0 + crop_size/2.
|
95 |
+
# predefined kinect intrinsics
|
96 |
+
if is_behave:
|
97 |
+
fx = 979.7844
|
98 |
+
fy = 979.840
|
99 |
+
cx = 1018.952
|
100 |
+
cy = 779.486
|
101 |
+
else:
|
102 |
+
# intercap camera
|
103 |
+
fx, fy = 918.457763671875, 918.4373779296875
|
104 |
+
cx, cy = 956.9661865234375, 555.944580078125
|
105 |
+
|
106 |
+
# construct the matrix
|
107 |
+
# A = np.array([
|
108 |
+
# [fx, 0, cx-x0, cx-x0, 0, 0],
|
109 |
+
# [0, fy, cy-y0, cy-y0, 0, 0],
|
110 |
+
# [fx, 0, cx-x1, 0, cx-x1, 0],
|
111 |
+
# [0, fy, cy-y1, 0, cy-y1, 0],
|
112 |
+
# [fx, 0, cx-x2, 0, 0, cx-x2],
|
113 |
+
# [0, fy, cy-y2, 0, 0, cy-y2]
|
114 |
+
# ]) # this matrix is low-rank because columns are linearly dependent: col3 - col4 = col5 + col6
|
115 |
+
# # find linearly dependent rows
|
116 |
+
# lambdas, V = np.linalg.eig(A)
|
117 |
+
# # print()
|
118 |
+
# # The linearly dependent row vectors
|
119 |
+
# print(lambdas == 0, np.linalg.det(A), A[lambdas == 0, :]) # some have determinant zero, some don't??
|
120 |
+
# print(np.linalg.inv(A))
|
121 |
+
|
122 |
+
# A = np.array([
|
123 |
+
# [fx, 0, cx - x0, cx - x0, 0, 0],
|
124 |
+
# [0, fy, cy - y0, cy - y0, 0, 0],
|
125 |
+
# [fx, 0, cx - x1, 0, cx - x1, 0],
|
126 |
+
# [0, fy, cy - y1, 0, cy - y1, 0],
|
127 |
+
# [fx, 0, cx - x3, 0, 0, cx - x3],
|
128 |
+
# [0, fy, cy - y3, 0, 0, cy - y3]
|
129 |
+
# ]) # this is also low rank!
|
130 |
+
# b = np.array([0, 0, -3*fx, 0, 0, -3*fy]).reshape((-1, 1))
|
131 |
+
# print("rank of the coefficient matrix:", np.linalg.matrix_rank(A)) # rank is 5! underconstrained matrix!
|
132 |
+
# x = np.matmul(np.linalg.inv(A), b)
|
133 |
+
|
134 |
+
# fix z0 as 0, then A is a full-rank matrix
|
135 |
+
# first two equations: origin (0, 0, 0) is projected to the crop center
|
136 |
+
# last two equations: edge point (3.5, 0, z) is projected to the edge of crop
|
137 |
+
A = np.array([
|
138 |
+
[fx, 0, cx-x0, cx-x0],
|
139 |
+
[0, fy, cy-y0, cy-y0],
|
140 |
+
[fx, 0, fx-x1, 0],
|
141 |
+
[0, fy, cy-y1, 0]
|
142 |
+
])
|
143 |
+
# b = np.array([0, 0, -3.5*fx, 0]).reshape((-1, 1)) # 3.5->half of 7.0
|
144 |
+
b = np.array([0, 0, -std_coverage * fx, 0]).reshape((-1, 1)) # 3.5->half of 7.0
|
145 |
+
x = np.matmul(np.linalg.inv(A), b) # use 4 or 5 does not really matter, same results
|
146 |
+
|
147 |
+
# A is always a full-rank matrix
|
148 |
+
|
149 |
+
return x.flatten()[:3]
|
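A minimal sketch of how `masks2bbox` and `compute_translation` combine to give the rough camera translation used in `dataset/demo_dataset.py`; the toy masks below are synthetic, and the division by 7.0 mirrors the normalization applied there.

```python
import numpy as np
from dataset.img_utils import masks2bbox, compute_translation

# Synthetic (H, W) masks at BEHAVE resolution (2048 x 1536), values in 0-255.
H, W = 1536, 2048
mask_hum = np.zeros((H, W), np.uint8); mask_hum[400:1100, 800:1100] = 255
mask_obj = np.zeros((H, W), np.uint8); mask_obj[700:1100, 1000:1400] = 255

bmin, bmax = masks2bbox([mask_hum, mask_obj])
crop_size = int(np.max(bmax - bmin))

# 2D projection centre: average of the human and object mask centres (same heuristic as the demo dataset).
idx = np.indices((H, W))
pts_h = np.stack([idx[1][mask_hum > 127], idx[0][mask_hum > 127]], -1)
pts_o = np.stack([idx[1][mask_obj > 127], idx[0][mask_obj > 127]], -1)
proj_cent = (np.mean(pts_h, 0) + np.mean(pts_o, 0)) / 2.

transl = compute_translation(proj_cent, crop_size, is_behave=True, std_coverage=3.5)
print(transl / 7.0)  # the demo dataset divides by 7.0 before using this as the H+O centre
```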
demo.py
ADDED
@@ -0,0 +1,280 @@
1 |
+
"""
|
2 |
+
Demo for template-free reconstruction
|
3 |
+
|
4 |
+
python demo.py model=ho-attn run.image_path=/BS/xxie-2/work/HDM/outputs/000000017450/k1.color.jpg run.job=sample model.predict_binary=True dataset.std_coverage=3.0
|
5 |
+
"""
|
6 |
+
import pickle as pkl
|
7 |
+
import sys, os
|
8 |
+
import os.path as osp
|
9 |
+
from typing import Iterable, Optional
|
10 |
+
|
11 |
+
import cv2
|
12 |
+
from accelerate import Accelerator
|
13 |
+
from tqdm import tqdm
|
14 |
+
from glob import glob
|
15 |
+
|
16 |
+
sys.path.append(os.getcwd())
|
17 |
+
import hydra
|
18 |
+
import torch
|
19 |
+
import numpy as np
|
20 |
+
import imageio
|
21 |
+
from torch.utils.data import DataLoader
|
22 |
+
from pytorch3d.datasets import R2N2, collate_batched_meshes
|
23 |
+
from pytorch3d.structures import Pointclouds
|
24 |
+
from pytorch3d.renderer import PerspectiveCameras, look_at_view_transform
|
25 |
+
from pytorch3d.io import IO
|
26 |
+
import torchvision.transforms.functional as TVF
|
27 |
+
from huggingface_hub import hf_hub_download
|
28 |
+
|
29 |
+
import training_utils
|
30 |
+
from configs.structured import ProjectConfig
|
31 |
+
from dataset.demo_dataset import DemoDataset
|
32 |
+
from model import CrossAttenHODiffusionModel, ConditionalPCDiffusionSeparateSegm
|
33 |
+
from render.pyt3d_wrapper import PcloudRenderer
|
34 |
+
|
35 |
+
|
36 |
+
class DemoRunner:
|
37 |
+
def __init__(self, cfg: ProjectConfig):
|
38 |
+
cfg.model.model_name, cfg.model.predict_binary = 'pc2-diff-ho-sepsegm', True
|
39 |
+
model_stage1 = ConditionalPCDiffusionSeparateSegm(**cfg.model)
|
40 |
+
cfg.model.model_name, cfg.model.predict_binary = 'diff-ho-attn', False # stage 2 does not predict segmentation
|
41 |
+
model_stage2 = CrossAttenHODiffusionModel(**cfg.model)
|
42 |
+
|
43 |
+
# Load from checkpoint
|
44 |
+
# ckpt_file1 = os.path.join(cfg.run.code_dir_abs, f'outputs/{cfg.run.stage1_name}/single/checkpoint-latest.pth')
|
45 |
+
# self.load_checkpoint(ckpt_file1, model_stage1)
|
46 |
+
# ckpt_file2 = os.path.join(cfg.run.code_dir_abs, f'outputs/{cfg.run.stage2_name}/single/checkpoint-latest.pth')
|
47 |
+
# self.load_checkpoint(ckpt_file2, model_stage2)
|
48 |
+
# Load ckpt from hf
|
49 |
+
ckpt_file1 = hf_hub_download("xiexh20/HDM-models", f'{cfg.run.stage1_name}.pth')
|
50 |
+
self.load_checkpoint(ckpt_file1, model_stage1)
|
51 |
+
ckpt_file2 = hf_hub_download("xiexh20/HDM-models", f'{cfg.run.stage2_name}.pth')
|
52 |
+
self.load_checkpoint(ckpt_file2, model_stage2)
|
53 |
+
|
54 |
+
self.model_stage1, self.model_stage2 = model_stage1, model_stage2
|
55 |
+
self.model_stage1.eval()
|
56 |
+
self.model_stage2.eval()
|
57 |
+
self.model_stage1.to('cuda')
|
58 |
+
self.model_stage2.to('cuda')
|
59 |
+
|
60 |
+
self.cfg = cfg
|
61 |
+
self.io_pc = IO()
|
62 |
+
|
63 |
+
# For visualization
|
64 |
+
self.renderer = PcloudRenderer(image_size=cfg.dataset.image_size, radius=0.0075)
|
65 |
+
self.rend_size = cfg.dataset.image_size
|
66 |
+
self.device = 'cuda'
|
67 |
+
|
68 |
+
def load_checkpoint(self, ckpt_file1, model_stage1):
|
69 |
+
checkpoint = torch.load(ckpt_file1, map_location='cpu')
|
70 |
+
state_dict, key = checkpoint['model'], 'model'
|
71 |
+
if any(k.startswith('module.') for k in state_dict.keys()):
|
72 |
+
state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
|
73 |
+
print('Removed "module." from checkpoint state dict')
|
74 |
+
missing_keys, unexpected_keys = model_stage1.load_state_dict(state_dict, strict=False)
|
75 |
+
print(f'Loaded model checkpoint {key} from {ckpt_file1}')
|
76 |
+
if len(missing_keys):
|
77 |
+
print(f' - Missing_keys: {missing_keys}')
|
78 |
+
if len(unexpected_keys):
|
79 |
+
print(f' - Unexpected_keys: {unexpected_keys}')
|
80 |
+
|
81 |
+
@torch.no_grad()
|
82 |
+
def run(self):
|
83 |
+
"simply run the demo on given images, and save the results"
|
84 |
+
# Set random seed
|
85 |
+
training_utils.set_seed(self.cfg.run.seed)
|
86 |
+
|
87 |
+
outdir = osp.join(self.cfg.run.code_dir_abs, 'outputs/demo')
|
88 |
+
os.makedirs(outdir, exist_ok=True)
|
89 |
+
cfg = self.cfg
|
90 |
+
|
91 |
+
# Init data
|
92 |
+
image_files = sorted(glob(cfg.run.image_path))
|
93 |
+
data = DemoDataset(image_files,
|
94 |
+
(cfg.dataset.image_size, cfg.dataset.image_size),
|
95 |
+
cfg.dataset.std_coverage)
|
96 |
+
dataloader = DataLoader(data, batch_size=cfg.dataloader.batch_size,
|
97 |
+
collate_fn=collate_batched_meshes,
|
98 |
+
num_workers=1, shuffle=False)
|
99 |
+
dataloader = dataloader
|
100 |
+
progress_bar = tqdm(dataloader)
|
101 |
+
for batch_idx, batch in enumerate(progress_bar):
|
102 |
+
progress_bar.set_description(f'Processing batch {batch_idx:4d} / {len(dataloader):4d}')
|
103 |
+
|
104 |
+
out_stage1, out_stage2 = self.forward_batch(batch, cfg)
|
105 |
+
|
106 |
+
bs = len(out_stage1)
|
107 |
+
camera_full = PerspectiveCameras(
|
108 |
+
R=torch.stack(batch['R']),
|
109 |
+
T=torch.stack(batch['T']),
|
110 |
+
K=torch.stack(batch['K']),
|
111 |
+
device='cuda',
|
112 |
+
in_ndc=True)
|
113 |
+
|
114 |
+
# save output
|
115 |
+
for i in range(bs):
|
116 |
+
image_path = str(batch['image_path'][i])  # each entry of the collated batch is a list; take the i-th sample
|
117 |
+
folder, fname = osp.basename(osp.dirname(image_path)), osp.splitext(osp.basename(image_path))[0]
|
118 |
+
out_i = osp.join(outdir, folder)
|
119 |
+
os.makedirs(out_i, exist_ok=True)
|
120 |
+
self.io_pc.save_pointcloud(data=out_stage1[i],
|
121 |
+
path=osp.join(out_i, f'{fname}_stage1.ply'))
|
122 |
+
self.io_pc.save_pointcloud(data=out_stage2[i],
|
123 |
+
path=osp.join(out_i, f'{fname}_stage2.ply'))
|
124 |
+
TVF.to_pil_image(batch['images'][i]).save(osp.join(out_i, f'{fname}_input.png'))
|
125 |
+
|
126 |
+
# Save metadata as well
|
127 |
+
metadata = dict(index=i,
|
128 |
+
camera=camera_full[i],
|
129 |
+
image_size_hw=batch['image_size_hw'][i],
|
130 |
+
image_path=batch['image_path'][i])
|
131 |
+
torch.save(metadata, osp.join(out_i, f'{fname}_meta.pth'))
|
132 |
+
|
133 |
+
# Visualize
|
134 |
+
# front_camera = camera_full[i]
|
135 |
+
pc_comb = Pointclouds([out_stage1[i].points_packed(), out_stage2[i].points_packed()],
|
136 |
+
features=[out_stage1[i].features_packed(), out_stage2[i].features_packed()])
|
137 |
+
video_file = osp.join(out_i, f'{fname}_360view.mp4')
|
138 |
+
video_writer = imageio.get_writer(video_file, format='FFMPEG', mode='I', fps=1)
|
139 |
+
|
140 |
+
# first render front view
|
141 |
+
rend_stage1, _ = self.renderer.render(out_stage1[i], camera_full[i], mode='mask')
|
142 |
+
rend_stage2, _ = self.renderer.render(out_stage2[i], camera_full[i], mode='mask')
|
143 |
+
comb = np.concatenate([batch['images'][i].permute(1, 2, 0).cpu().numpy(), rend_stage1, rend_stage2], 1)
|
144 |
+
video_writer.append_data((comb*255).astype(np.uint8))
|
145 |
+
|
146 |
+
for azim in range(180, 180+360, 30):
|
147 |
+
R, T = look_at_view_transform(1.7, 0, azim, up=((0, -1, 0),), )
|
148 |
+
side_camera = PerspectiveCameras(image_size=((self.rend_size, self.rend_size),),
|
149 |
+
device=self.device,
|
150 |
+
R=R.repeat(2, 1, 1), T=T.repeat(2, 1),
|
151 |
+
focal_length=self.rend_size * 1.5,
|
152 |
+
principal_point=((self.rend_size / 2., self.rend_size / 2.),),
|
153 |
+
in_ndc=False)
|
154 |
+
rend, mask = self.renderer.render(pc_comb, side_camera, mode='mask')
|
155 |
+
|
156 |
+
imgs = [batch['images'][i].permute(1, 2, 0).cpu().numpy()]
|
157 |
+
imgs.extend([rend[0], rend[1]])
|
158 |
+
video_writer.append_data((np.concatenate(imgs, 1)*255).astype(np.uint8))
|
159 |
+
print(f"Visualization saved to {out_i}")
|
160 |
+
|
161 |
+
@torch.no_grad()
|
162 |
+
def forward_batch(self, batch, cfg):
|
163 |
+
"""
|
164 |
+
forward one batch
|
165 |
+
:param batch:
|
166 |
+
:param cfg:
|
167 |
+
:return: predicted point clouds of stage 1 and 2
|
168 |
+
"""
|
169 |
+
camera_full = PerspectiveCameras(
|
170 |
+
R=torch.stack(batch['R']),
|
171 |
+
T=torch.stack(batch['T']),
|
172 |
+
K=torch.stack(batch['K']),
|
173 |
+
device='cuda',
|
174 |
+
in_ndc=True)
|
175 |
+
out_stage1 = self.model_stage1.forward_sample(num_points=cfg.dataset.max_points,
|
176 |
+
camera=camera_full,
|
177 |
+
image_rgb=torch.stack(batch['images']).to('cuda'),
|
178 |
+
mask=torch.stack(batch['masks']).to('cuda'),
|
179 |
+
scheduler=cfg.run.diffusion_scheduler,
|
180 |
+
num_inference_steps=cfg.run.num_inference_steps,
|
181 |
+
)
|
182 |
+
# segment and normalize human/object
|
183 |
+
bs = len(out_stage1)
|
184 |
+
pred_hum, pred_obj = [], [] # predicted human/object points
|
185 |
+
cent_hum_pred, cent_obj_pred = [], []
|
186 |
+
radius_hum_pred, radius_obj_pred = [], []
|
187 |
+
T_hum, T_obj = [], []
|
188 |
+
num_samples = int(cfg.dataset.max_points / 2)
|
189 |
+
for i in range(bs):
|
190 |
+
pc: Pointclouds = out_stage1[i]
|
191 |
+
vc = pc.features_packed().cpu() # (P, 3), human is light blue [0.1, 1.0, 1.0], object light green [0.5, 1.0, 0]
|
192 |
+
points = pc.points_packed().cpu() # (P, 3)
|
193 |
+
mask_hum = vc[:, 2] > 0.5
|
194 |
+
pc_hum, pc_obj = points[mask_hum], points[~mask_hum]
|
195 |
+
# Up/Down-sample the points
|
196 |
+
pc_obj = self.upsample_predicted_pc(num_samples, pc_obj)
|
197 |
+
pc_hum = self.upsample_predicted_pc(num_samples, pc_hum)
|
198 |
+
|
199 |
+
# Normalize
|
200 |
+
cent_hum, cent_obj = torch.mean(pc_hum, 0, keepdim=True), torch.mean(pc_obj, 0, keepdim=True)
|
201 |
+
scale_hum = torch.sqrt(torch.sum((pc_hum - cent_hum) ** 2, -1).max())
|
202 |
+
scale_obj = torch.sqrt(torch.sum((pc_obj - cent_obj) ** 2, -1).max())
|
203 |
+
pc_hum = (pc_hum - cent_hum) / (2 * scale_hum)
|
204 |
+
pc_obj = (pc_obj - cent_obj) / (2 * scale_obj)
|
205 |
+
# Also update camera parameters for separate human + object
|
206 |
+
T_hum_scaled = (batch['T_ho'][i] + cent_hum.squeeze(0)) / (2 * scale_hum)
|
207 |
+
T_obj_scaled = (batch['T_ho'][i] + cent_obj.squeeze(0)) / (2 * scale_obj)
|
208 |
+
|
209 |
+
pred_hum.append(pc_hum)
|
210 |
+
pred_obj.append(pc_obj)
|
211 |
+
cent_hum_pred.append(cent_hum.squeeze(0))
|
212 |
+
cent_obj_pred.append(cent_obj.squeeze(0))
|
213 |
+
T_hum.append(T_hum_scaled * torch.tensor([-1, -1, 1])) # apply opencv to pytorch3d transform: flip x and y
|
214 |
+
T_obj.append(T_obj_scaled * torch.tensor([-1, -1, 1]))
|
215 |
+
radius_hum_pred.append(scale_hum)
|
216 |
+
radius_obj_pred.append(scale_obj)
|
217 |
+
# Pack data into a new batch dict
|
218 |
+
camera_hum = PerspectiveCameras(
|
219 |
+
R=torch.stack(batch['R']),
|
220 |
+
T=torch.stack(T_hum),
|
221 |
+
K=torch.stack(batch['K_hum']),
|
222 |
+
device='cuda',
|
223 |
+
in_ndc=True
|
224 |
+
)
|
225 |
+
camera_obj = PerspectiveCameras(
|
226 |
+
R=torch.stack(batch['R']),
|
227 |
+
T=torch.stack(T_obj),
|
228 |
+
K=torch.stack(batch['K_obj']), # the camera should be human/object specific!!!
|
229 |
+
device='cuda',
|
230 |
+
in_ndc=True
|
231 |
+
)
|
232 |
+
# use pc from predicted
|
233 |
+
pc_hum = Pointclouds([x.to('cuda') for x in pred_hum])
|
234 |
+
pc_obj = Pointclouds([x.to('cuda') for x in pred_obj])
|
235 |
+
# use center and radius from predicted
|
236 |
+
cent_hum = torch.stack(cent_hum_pred, 0).to('cuda')
|
237 |
+
cent_obj = torch.stack(cent_obj_pred, 0).to('cuda') # B, 3
|
238 |
+
radius_hum = torch.stack(radius_hum_pred, 0).to('cuda') # B, 1
|
239 |
+
radius_obj = torch.stack(radius_obj_pred, 0).to('cuda')
|
240 |
+
out_stage2: Pointclouds = self.model_stage2.forward_sample(
|
241 |
+
num_points=num_samples,
|
242 |
+
camera=camera_hum,
|
243 |
+
image_rgb=torch.stack(batch['images_hum'], 0).to('cuda'),
|
244 |
+
mask=torch.stack(batch['masks_hum'], 0).to('cuda'),
|
245 |
+
gt_pc=pc_hum,
|
246 |
+
rgb_obj=torch.stack(batch['images_obj'], 0).to('cuda'),
|
247 |
+
mask_obj=torch.stack(batch['masks_obj'], 0).to('cuda'),
|
248 |
+
pc_obj=pc_obj,
|
249 |
+
camera_obj=camera_obj,
|
250 |
+
cent_hum=cent_hum,
|
251 |
+
cent_obj=cent_obj,
|
252 |
+
radius_hum=radius_hum.unsqueeze(-1),
|
253 |
+
radius_obj=radius_obj.unsqueeze(-1),
|
254 |
+
sample_from_interm=True,
|
255 |
+
noise_step=cfg.run.sample_noise_step)
|
256 |
+
return out_stage1, out_stage2
|
257 |
+
|
258 |
+
def upsample_predicted_pc(self, num_samples, pc_obj):
|
259 |
+
"""
|
260 |
+
Up/Downsample the points to given number
|
261 |
+
:param num_samples: the target number
|
262 |
+
:param pc_obj: (N, 3)
|
263 |
+
:return: (num_samples, 3)
|
264 |
+
"""
|
265 |
+
if len(pc_obj) > num_samples:
|
266 |
+
ind_obj = np.random.choice(len(pc_obj), num_samples)
|
267 |
+
else:
|
268 |
+
ind_obj = np.concatenate([np.arange(len(pc_obj)), np.random.choice(len(pc_obj), num_samples - len(pc_obj))])
|
269 |
+
pc_obj = pc_obj.clone()[torch.from_numpy(ind_obj).long().to(pc_obj.device)]
|
270 |
+
return pc_obj
|
271 |
+
|
272 |
+
|
273 |
+
@hydra.main(config_path='configs', config_name='configs', version_base='1.1')
|
274 |
+
def main(cfg: ProjectConfig):
|
275 |
+
runner = DemoRunner(cfg)
|
276 |
+
runner.run()
|
277 |
+
|
278 |
+
|
279 |
+
if __name__ == '__main__':
|
280 |
+
main()
|
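After running the command documented in the module docstring above, the results land in `outputs/demo/<folder>/`, where `<folder>` is the parent directory name of the input image. A minimal sketch of loading them back, assuming the bundled example image `examples/017450/k1.color.jpg` was used:

```python
import torch
from pytorch3d.io import IO

out_dir = "outputs/demo/017450"
stage1 = IO().load_pointcloud(f"{out_dir}/k1.color_stage1.ply")   # joint human + object prediction
stage2 = IO().load_pointcloud(f"{out_dir}/k1.color_stage2.ply")   # refined human and object points
meta = torch.load(f"{out_dir}/k1.color_meta.pth", map_location="cpu")  # camera, image size, image path
print(stage1.points_packed().shape, stage2.points_packed().shape, meta["image_path"])
```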
diffusion_utils.py
ADDED
@@ -0,0 +1,313 @@
1 |
+
import math
|
2 |
+
from typing import List, Optional, Sequence, Union
|
3 |
+
|
4 |
+
import imageio
|
5 |
+
import logging
|
6 |
+
import numpy as np
|
7 |
+
import torch
|
8 |
+
import torch.utils.data
|
9 |
+
from PIL import Image
|
10 |
+
from torch.distributions import Normal
|
11 |
+
from torchvision.transforms.functional import to_pil_image
|
12 |
+
from torchvision.utils import make_grid
|
13 |
+
from tqdm import tqdm, trange
|
14 |
+
from pytorch3d.renderer import (
|
15 |
+
AlphaCompositor,
|
16 |
+
NormWeightedCompositor,
|
17 |
+
OrthographicCameras,
|
18 |
+
PointsRasterizationSettings,
|
19 |
+
PointsRasterizer,
|
20 |
+
PointsRenderer,
|
21 |
+
look_at_view_transform)
|
22 |
+
from pytorch3d.renderer.cameras import CamerasBase
|
23 |
+
from pytorch3d.structures import Pointclouds
|
24 |
+
from pytorch3d.structures.pointclouds import join_pointclouds_as_batch
|
25 |
+
|
26 |
+
|
27 |
+
# Disable unnecessary imageio logging
|
28 |
+
logging.getLogger("imageio_ffmpeg").setLevel(logging.ERROR)
|
29 |
+
|
30 |
+
|
31 |
+
def rotation_matrix(axis, theta):
|
32 |
+
"""
|
33 |
+
Return the rotation matrix associated with counterclockwise rotation about
|
34 |
+
the given axis by theta radians.
|
35 |
+
"""
|
36 |
+
axis = np.asarray(axis)
|
37 |
+
axis = axis / np.sqrt(np.dot(axis, axis))
|
38 |
+
a = np.cos(theta / 2.0)
|
39 |
+
b, c, d = -axis * np.sin(theta / 2.0)
|
40 |
+
aa, bb, cc, dd = a * a, b * b, c * c, d * d
|
41 |
+
bc, ad, ac, ab, bd, cd = b * c, a * d, a * c, a * b, b * d, c * d
|
42 |
+
return np.array([[aa + bb - cc - dd, 2 * (bc + ad), 2 * (bd - ac)],
|
43 |
+
[2 * (bc - ad), aa + cc - bb - dd, 2 * (cd + ab)],
|
44 |
+
[2 * (bd + ac), 2 * (cd - ab), aa + dd - bb - cc]])
|
45 |
+
|
46 |
+
|
47 |
+
def rotate(vertices, faces):
|
48 |
+
'''
|
49 |
+
vertices: [numpoints, 3]
|
50 |
+
'''
|
51 |
+
M = rotation_matrix([0, 1, 0], np.pi / 2).transpose()
|
52 |
+
N = rotation_matrix([1, 0, 0], -np.pi / 4).transpose()
|
53 |
+
K = rotation_matrix([0, 0, 1], np.pi).transpose()
|
54 |
+
|
55 |
+
v, f = vertices[:, [1, 2, 0]].dot(M).dot(N).dot(K), faces[:, [1, 2, 0]]
|
56 |
+
return v, f
|
57 |
+
|
58 |
+
|
59 |
+
def norm(v, f):
|
60 |
+
v = (v - v.min()) / (v.max() - v.min()) - 0.5
|
61 |
+
|
62 |
+
return v, f
|
63 |
+
|
64 |
+
|
65 |
+
def getGradNorm(net):
|
66 |
+
pNorm = torch.sqrt(sum(torch.sum(p ** 2) for p in net.parameters()))
|
67 |
+
gradNorm = torch.sqrt(sum(torch.sum(p.grad ** 2) for p in net.parameters()))
|
68 |
+
return pNorm, gradNorm
|
69 |
+
|
70 |
+
|
71 |
+
def weights_init(m):
|
72 |
+
classname = m.__class__.__name__
|
73 |
+
if classname.find('Conv') != -1 and m.weight is not None:
|
74 |
+
torch.nn.init.xavier_normal_(m.weight)
|
75 |
+
elif classname.find('BatchNorm') != -1:
|
76 |
+
m.weight.data.normal_()
|
77 |
+
m.bias.data.fill_(0)
|
78 |
+
|
79 |
+
|
80 |
+
def discretized_gaussian_log_likelihood(x, *, means, log_scales):
|
81 |
+
# Assumes data is integers [0, 1]
|
82 |
+
assert x.shape == means.shape == log_scales.shape
|
83 |
+
px0 = Normal(torch.zeros_like(means), torch.ones_like(log_scales))
|
84 |
+
|
85 |
+
centered_x = x - means
|
86 |
+
inv_stdv = torch.exp(-log_scales)
|
87 |
+
plus_in = inv_stdv * (centered_x + 0.5)
|
88 |
+
cdf_plus = px0.cdf(plus_in)
|
89 |
+
min_in = inv_stdv * (centered_x - .5)
|
90 |
+
cdf_min = px0.cdf(min_in)
|
91 |
+
log_cdf_plus = torch.log(torch.max(cdf_plus, torch.ones_like(cdf_plus) * 1e-12))
|
92 |
+
log_one_minus_cdf_min = torch.log(torch.max(1. - cdf_min, torch.ones_like(cdf_min) * 1e-12))
|
93 |
+
cdf_delta = cdf_plus - cdf_min
|
94 |
+
|
95 |
+
log_probs = torch.where(
|
96 |
+
x < 0.001, log_cdf_plus,
|
97 |
+
torch.where(x > 0.999, log_one_minus_cdf_min,
|
98 |
+
torch.log(torch.max(cdf_delta, torch.ones_like(cdf_delta) * 1e-12))))
|
99 |
+
assert log_probs.shape == x.shape
|
100 |
+
return log_probs
|
101 |
+
|
102 |
+
|
103 |
+
def fig2img(fig):
|
104 |
+
"""Convert a Matplotlib figure to a PIL Image and return it"""
|
105 |
+
import io
|
106 |
+
buf = io.BytesIO()
|
107 |
+
fig.savefig(buf)
|
108 |
+
buf.seek(0)
|
109 |
+
img = Image.open(buf)
|
110 |
+
return img
|
111 |
+
|
112 |
+
|
113 |
+
@torch.no_grad()
|
114 |
+
def visualize_distance_transform(
|
115 |
+
path_stem: str,
|
116 |
+
images: torch.Tensor,
|
117 |
+
) -> str:
|
118 |
+
output_file_image = f'{path_stem}.png'
|
119 |
+
if images.shape[3] in [1, 3]: # convert to (B, C, H, W)
|
120 |
+
images = images.permute(0, 3, 1, 2)
|
121 |
+
images = images[:, -1:] # (B, 1, H, W) # get only distances (not vectors for now, for simplicity)
|
122 |
+
image_grid = make_grid(images, nrow=int(math.sqrt(len(images))), pad_value=1, normalize=True)
|
123 |
+
to_pil_image(image_grid).save(output_file_image)
|
124 |
+
return output_file_image
|
125 |
+
|
126 |
+
|
127 |
+
@torch.no_grad()
|
128 |
+
def visualize_image(
|
129 |
+
path_stem: str,
|
130 |
+
images: torch.Tensor,
|
131 |
+
mean: Union[torch.Tensor, float] = 0.5,
|
132 |
+
std: Union[torch.Tensor, float] = 0.5,
|
133 |
+
) -> str:
|
134 |
+
output_file_image = f'{path_stem}.png'
|
135 |
+
if images.shape[3] in [1, 3, 4]: # convert to (B, C, H, W)
|
136 |
+
images = images.permute(0, 3, 1, 2)
|
137 |
+
if images.shape[1] in [3, 4]: # normalize (single-channel images are not normalized)
|
138 |
+
images[:, :3] = images[:, :3] * std + mean # denormalize (color channels only, not alpha channel)
|
139 |
+
if images.shape[1] == 4: # normalize (single-channel images are not normalized)
|
140 |
+
image_alpha = images[:, 3:] # (B, 1, H, W)
|
141 |
+
bg_color = torch.tensor([230, 220, 250], device=images.device).reshape(1, 3, 1, 1) / 255
|
142 |
+
images = images[:, :3] * image_alpha + bg_color * (1 - image_alpha) # (B, 3, H, W)
|
143 |
+
image_grid = make_grid(images, nrow=int(math.sqrt(len(images))), pad_value=1)
|
144 |
+
to_pil_image(image_grid).save(output_file_image)
|
145 |
+
return output_file_image
|
146 |
+
|
147 |
+
|
148 |
+
def ensure_point_cloud_has_colors(pointcloud: Pointclouds):
|
149 |
+
if pointcloud.features_padded() is None:
|
150 |
+
pointcloud = type(pointcloud)(points=pointcloud.points_padded(),
|
151 |
+
normals=pointcloud.normals_padded(), features=torch.zeros_like(pointcloud.points_padded()))
|
152 |
+
return pointcloud
|
153 |
+
|
154 |
+
|
155 |
+
@torch.no_grad()
|
156 |
+
def render_pointcloud_batch_pytorch3d(
|
157 |
+
cameras: CamerasBase,
|
158 |
+
pointclouds: Pointclouds,
|
159 |
+
image_size: int = 224,
|
160 |
+
radius: float = 0.01,
|
161 |
+
points_per_pixel: int = 10,
|
162 |
+
background_color: Sequence[float] = (0.78431373, 0.78431373, 0.78431373),
|
163 |
+
compositor: str = 'norm_weighted'
|
164 |
+
):
|
165 |
+
# Define the settings for rasterization and shading. Here we set the output image to be of size
|
166 |
+
# 512x512. As we are rendering images for visualization purposes only we will set faces_per_pixel=1
|
167 |
+
# and blur_radius=0.0. Refer to rasterize_points.py for explanations of these parameters.
|
168 |
+
raster_settings = PointsRasterizationSettings(
|
169 |
+
image_size=image_size,
|
170 |
+
radius=radius,
|
171 |
+
points_per_pixel=points_per_pixel,
|
172 |
+
)
|
173 |
+
|
174 |
+
# Rasterizer
|
175 |
+
rasterizer = PointsRasterizer(cameras=cameras, raster_settings=raster_settings)
|
176 |
+
|
177 |
+
# Compositor
|
178 |
+
if compositor == 'alpha':
|
179 |
+
compositor = AlphaCompositor(background_color=background_color)
|
180 |
+
elif compositor == 'norm_weighted':
|
181 |
+
compositor = NormWeightedCompositor(background_color=background_color)
|
182 |
+
else:
|
183 |
+
raise ValueError(compositor)
|
184 |
+
|
185 |
+
# Create a points renderer by compositing points using an weighted compositor (3D points are
|
186 |
+
# weighted according to their distance to a pixel and accumulated using a weighted sum)
|
187 |
+
renderer = PointsRenderer(rasterizer=rasterizer, compositor=compositor)
|
188 |
+
|
189 |
+
# We cannot render a point cloud without colors, so add them if the pointcloud does
|
190 |
+
# not already have them
|
191 |
+
pointclouds = ensure_point_cloud_has_colors(pointclouds)
|
192 |
+
|
193 |
+
# Render batch of image
|
194 |
+
images = renderer(pointclouds)
|
195 |
+
|
196 |
+
return images
|
197 |
+
|
198 |
+
|
199 |
+
@torch.no_grad()
|
200 |
+
def visualize_pointcloud_batch_pytorch3d(
|
201 |
+
pointclouds: Pointclouds,
|
202 |
+
output_file_video: Optional[str] = None,
|
203 |
+
output_file_image: Optional[str] = None,
|
204 |
+
cameras: Optional[CamerasBase] = None, # if None, we rotate
|
205 |
+
scale_factor: float = 1.0,
|
206 |
+
num_frames: int = 1, # note that it takes a while with 30 * batch_size frames
|
207 |
+
elev: int = 30,
|
208 |
+
):
|
209 |
+
"""Saves a video and a single image of a point cloud"""
|
210 |
+
assert 360 % num_frames == 0, 'please select a better number of frames'
|
211 |
+
|
212 |
+
# Sizes
|
213 |
+
B, N, C, F = *(pointclouds.points_padded().shape), num_frames
|
214 |
+
device = pointclouds.device
|
215 |
+
|
216 |
+
# If a camera has not been provided, we render from a rotating view around an image
|
217 |
+
if cameras is None:
|
218 |
+
|
219 |
+
# Create view transforms - R is (F, 3, 3) and T is (F, 3)
|
220 |
+
R, T = look_at_view_transform(dist=10.0, elev=elev, azim=list(range(0, 360, 360 // F)), degrees=True, device=device)
|
221 |
+
|
222 |
+
# Repeat
|
223 |
+
R = R.repeat_interleave(B, dim=0) # (F * B, 3, 3)
|
224 |
+
T = T.repeat_interleave(B, dim=0) # (F * B, 3)
|
225 |
+
points = pointclouds.points_padded().tile(F, 1, 1) # (F * B, num_points, 3)
|
226 |
+
colors = (torch.zeros_like(points) if pointclouds.features_padded() is None else
|
227 |
+
pointclouds.features_padded().tile(F, 1, 1)) # (F * B, num_points, 3)
|
228 |
+
|
229 |
+
# Initialize batch of cameras
|
230 |
+
cameras = OrthographicCameras(focal_length=(0.25 * scale_factor), device=device, R=R, T=T)
|
231 |
+
|
232 |
+
# Wrap in Pointclouds (with color, even if the original point cloud had no color)
|
233 |
+
pointclouds = Pointclouds(points=points, features=colors).to(device)
|
234 |
+
|
235 |
+
# Render image
|
236 |
+
images = render_pointcloud_batch_pytorch3d(cameras, pointclouds)
|
237 |
+
|
238 |
+
# Convert images into grid
|
239 |
+
image_grids = []
|
240 |
+
images_for_grids = images.reshape(F, B, *images.shape[1:]).permute(0, 1, 4, 2, 3)
|
241 |
+
for image_for_grids in images_for_grids:
|
242 |
+
image_grid = make_grid(image_for_grids, nrow=int(math.sqrt(B)), pad_value=1)
|
243 |
+
image_grids.append(image_grid)
|
244 |
+
image_grids = torch.stack(image_grids, dim=0)
|
245 |
+
image_grids = image_grids.detach().cpu()
|
246 |
+
|
247 |
+
# Save image
|
248 |
+
if output_file_image is not None:
|
249 |
+
to_pil_image(image_grids[0]).save(output_file_image)
|
250 |
+
|
251 |
+
# Save video
|
252 |
+
if output_file_video:
|
253 |
+
video = (image_grids * 255).permute(0, 2, 3, 1).to(torch.uint8).numpy()
|
254 |
+
imageio.mimwrite(output_file_video, video, fps=10)
|
255 |
+
|
256 |
+
|
257 |
+
@torch.no_grad()
|
258 |
+
def visualize_pointcloud_evolution_pytorch3d(
|
259 |
+
pointclouds: Pointclouds,
|
260 |
+
output_file_video: str,
|
261 |
+
camera: Optional[CamerasBase] = None, # if None, we rotate
|
262 |
+
scale_factor: float = 1.0,
|
263 |
+
):
|
264 |
+
|
265 |
+
# Device
|
266 |
+
B, device = len(pointclouds), pointclouds.device
|
267 |
+
|
268 |
+
# Cameras
|
269 |
+
if camera is None:
|
270 |
+
R, T = look_at_view_transform(dist=10.0, elev=30, azim=0, device=device)
|
271 |
+
camera = OrthographicCameras(focal_length=(0.25 * scale_factor), device=device, R=R, T=T)
|
272 |
+
|
273 |
+
# Render
|
274 |
+
frames = render_pointcloud_batch_pytorch3d(camera, pointclouds)
|
275 |
+
|
276 |
+
# Save video
|
277 |
+
video = (frames.detach().cpu() * 255).to(torch.uint8).numpy()
|
278 |
+
imageio.mimwrite(output_file_video, video, fps=10)
|
279 |
+
|
280 |
+
|
281 |
+
def get_camera_index(cameras: CamerasBase, index: Optional[int] = None):
|
282 |
+
if index is None:
|
283 |
+
return cameras
|
284 |
+
kwargs = dict(
|
285 |
+
R=cameras.R[index].unsqueeze(0),
|
286 |
+
T=cameras.T[index].unsqueeze(0),
|
287 |
+
K=cameras.K[index].unsqueeze(0) if cameras.K is not None else None,
|
288 |
+
)
|
289 |
+
if hasattr(cameras, 'focal_length'):
|
290 |
+
kwargs['focal_length'] = cameras.focal_length[index].unsqueeze(0)
|
291 |
+
if hasattr(cameras, 'principal_point'):
|
292 |
+
kwargs['principal_point'] = cameras.principal_point[index].unsqueeze(0)
|
293 |
+
return type(cameras)(**kwargs).to(cameras.device)
|
294 |
+
|
295 |
+
|
296 |
+
def get_metadata(item) -> str:
|
297 |
+
s = '-------------\n'
|
298 |
+
for key in item.keys():
|
299 |
+
value = item[key]
|
300 |
+
if torch.is_tensor(value) and value.numel() < 25:
|
301 |
+
value_str = value
|
302 |
+
elif torch.is_tensor(value):
|
303 |
+
value_str = value.shape
|
304 |
+
elif isinstance(value, str):
|
305 |
+
value_str = value
|
306 |
+
elif isinstance(value, list) and 0 < len(value) and len(value) < 25 and isinstance(value[0], str):
|
307 |
+
value_str = value
|
308 |
+
elif isinstance(value, dict):
|
309 |
+
value_str = str({k: type(v) for k, v in value.items()})
|
310 |
+
else:
|
311 |
+
value_str = type(value)
|
312 |
+
s += f"{key:<30} {value_str}\n"
|
313 |
+
return s
|
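A quick sanity check of `rotation_matrix` above (a quaternion/Rodrigues-style construction): rotating the x-axis by 90 degrees about +z should give the y-axis, and the result should be orthonormal.

```python
import numpy as np
from diffusion_utils import rotation_matrix

R = rotation_matrix([0, 0, 1], np.pi / 2)  # counterclockwise 90 degrees about +z
print(np.allclose(R @ np.array([1.0, 0.0, 0.0]), [0.0, 1.0, 0.0]))  # x-axis -> y-axis: True
print(np.allclose(R @ R.T, np.eye(3)))                              # orthonormal: True
```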
examples/017450/k1.color.jpg
ADDED
examples/017450/k1.obj_rend_mask.png
ADDED
examples/017450/k1.person_mask.png
ADDED
model/__init__.py
ADDED
@@ -0,0 +1,28 @@
from configs.structured import ProjectConfig
from .model import ConditionalPointCloudDiffusionModel
from .model_coloring import PointCloudColoringModel
from .model_utils import set_requires_grad
from .model_diff_data import ConditionalPCDiffusionSeparateSegm
from .model_hoattn import CrossAttenHODiffusionModel

def get_model(cfg: ProjectConfig):
    if cfg.model.model_name == 'pc2-diff':
        model = ConditionalPointCloudDiffusionModel(**cfg.model)
    elif cfg.model.model_name == 'pc2-diff-ho-sepsegm':
        model = ConditionalPCDiffusionSeparateSegm(**cfg.model)
        print("Using a separate model to predict segmentation label")
    elif cfg.model.model_name == 'diff-ho-attn':
        model = CrossAttenHODiffusionModel(**cfg.model)
        print("Using separate model for human + object with cross attention.")
    else:
        raise NotImplementedError
    if cfg.run.freeze_feature_model:
        set_requires_grad(model.feature_model, False)
    return model


def get_coloring_model(cfg: ProjectConfig):
    model = PointCloudColoringModel(**cfg.model)
    if cfg.run.freeze_feature_model:
        set_requires_grad(model.feature_model, False)
    return model
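A minimal sketch of how the two demo models are obtained through this factory, mirroring the model names and the `predict_binary` toggle used in `demo.py`; the Hydra composition assumes the configs in `configs/` are on the search path.

```python
import hydra
from configs.structured import ProjectConfig
from model import get_model

@hydra.main(config_path='configs', config_name='configs', version_base='1.1')
def main(cfg: ProjectConfig):
    cfg.model.model_name, cfg.model.predict_binary = 'pc2-diff-ho-sepsegm', True  # stage 1
    stage1 = get_model(cfg)
    cfg.model.model_name, cfg.model.predict_binary = 'diff-ho-attn', False        # stage 2
    stage2 = get_model(cfg)
    print(type(stage1).__name__, type(stage2).__name__)

if __name__ == '__main__':
    main()
```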
model/feature_model.py
ADDED
@@ -0,0 +1,160 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers import ModelMixin
from timm.models.vision_transformer import VisionTransformer, resize_pos_embed
from torch import Tensor
from torchvision.transforms import functional as TVF


IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)

MODEL_URLS = {
    'vit_base_patch16_224_mae': 'https://dl.fbaipublicfiles.com/mae/pretrain/mae_pretrain_vit_base.pth',
    'vit_small_patch16_224_msn': 'https://dl.fbaipublicfiles.com/msn/vits16_800ep.pth.tar',
    'vit_large_patch7_224_msn': 'https://dl.fbaipublicfiles.com/msn/vitl7_200ep.pth.tar',
}

NORMALIZATION = {
    'vit_base_patch16_224_mae': (IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
    'vit_small_patch16_224_msn': (IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
    'vit_large_patch7_224_msn': (IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
}

MODEL_KWARGS = {
    'vit_base_patch16_224_mae': dict(
        patch_size=16, embed_dim=768, depth=12, num_heads=12,
    ),
    'vit_small_patch16_224_msn': dict(
        patch_size=16, embed_dim=384, depth=12, num_heads=6,
    ),
    'vit_large_patch7_224_msn': dict(
        patch_size=7, embed_dim=1024, depth=24, num_heads=16,
    )
}


class FeatureModel(ModelMixin, ConfigMixin):

    @register_to_config
    def __init__(
        self,
        image_size: int = 224,
        model_name: str = 'vit_small_patch16_224_mae',
        global_pool: str = '',  # '' or 'token'
    ) -> None:
        super().__init__()
        self.model_name = model_name

        # Identity
        if self.model_name == 'identity':
            return

        # Create model
        self.model = VisionTransformer(
            img_size=image_size, num_classes=0, global_pool=global_pool,
            **MODEL_KWARGS[model_name])

        # Model properties
        self.feature_dim = self.model.embed_dim
        self.mean, self.std = NORMALIZATION[model_name]

        # # Modify MSN model with output head from training
        # if model_name.endswith('msn'):
        #     use_bn = True
        #     emb_dim = (192 if 'tiny' in model_name else 384 if 'small' in model_name else
        #                768 if 'base' in model_name else 1024 if 'large' in model_name else 1280)
        #     hidden_dim = 2048
        #     output_dim = 256
        #     self.model.fc = None
        #     fc = OrderedDict([])
        #     fc['fc1'] = torch.nn.Linear(emb_dim, hidden_dim)
        #     if use_bn:
        #         fc['bn1'] = torch.nn.BatchNorm1d(hidden_dim)
        #     fc['gelu1'] = torch.nn.GELU()
        #     fc['fc2'] = torch.nn.Linear(hidden_dim, hidden_dim)
        #     if use_bn:
        #         fc['bn2'] = torch.nn.BatchNorm1d(hidden_dim)
        #     fc['gelu2'] = torch.nn.GELU()
        #     fc['fc3'] = torch.nn.Linear(hidden_dim, output_dim)
        #     self.model.fc = torch.nn.Sequential(fc)

        # Load pretrained checkpoint
        checkpoint = torch.hub.load_state_dict_from_url(MODEL_URLS[model_name])
        if 'model' in checkpoint:
            state_dict = checkpoint['model']
        elif 'target_encoder' in checkpoint:
            state_dict = checkpoint['target_encoder']
            state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
            # NOTE: Comment the line below if using the projection head, uncomment if not using it
            # See https://github.com/facebookresearch/msn/blob/81cb855006f41cd993fbaad4b6a6efbb486488e6/src/msn_train.py#L490-L502
            # for more info about the projection head
            state_dict = {k: v for k, v in state_dict.items() if not k.startswith('fc.')}
        else:
            raise NotImplementedError()
        state_dict['pos_embed'] = resize_pos_embed(state_dict['pos_embed'], self.model.pos_embed)
        self.model.load_state_dict(state_dict)
        self.model.eval()

        # # Modify MSN model with output head from training
        # if model_name.endswith('msn'):
        #     self.fc = self.model.fc
        #     del self.model.fc
        # else:
        #     self.fc = nn.Identity()

        # NOTE: I've disabled the whole projection head stuff for simplicity for now
        self.fc = nn.Identity()

    def denormalize(self, img: Tensor):
        img = TVF.normalize(img, mean=[-m/s for m, s in zip(self.mean, self.std)], std=[1/s for s in self.std])
        return torch.clip(img, 0, 1)

    def normalize(self, img: Tensor):
        return TVF.normalize(img, mean=self.mean, std=self.std)

    def forward(
        self,
        x: Tensor,
        return_type: str = 'features',
        return_upscaled_features: bool = True,
        return_projection_head_output: bool = False,
    ):
        """Normalizes the input `x` and runs it through `model` to obtain features"""
        assert return_type in {'cls_token', 'features', 'all'}

        # Identity
        if self.model_name == 'identity':
            return x

        # Normalize and forward
        B, C, H, W = x.shape
        x = self.normalize(x)
        feats = self.model(x)

        # Reshape to image-like size
        if return_type in {'features', 'all'}:
            B, T, D = feats.shape
            assert math.sqrt(T - 1).is_integer()
            HW_down = int(math.sqrt(T - 1))  # subtract one for CLS token
            output_feats: Tensor = feats[:, 1:, :].reshape(B, HW_down, HW_down, D).permute(0, 3, 1, 2)  # (B, D, H_down, W_down)
            if return_upscaled_features:
                output_feats = F.interpolate(output_feats, size=(H, W), mode='bilinear',
                                             align_corners=False)  # (B, D, H_orig, W_orig)

        # Head for MSN
        output_cls = feats[:, 0]
        if return_projection_head_output and return_type in {'cls_token', 'all'}:
            output_cls = self.fc(output_cls)

        # Return
        if return_type == 'cls_token':
            return output_cls
        elif return_type == 'features':
            return output_feats
        else:
            return output_cls, output_feats
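A quick usage sketch (not part of the diff) of the feature extractor above, assuming the `vit_base_patch16_224_mae` backbone (embed dim 768) and that the pretrained checkpoint can be downloaded:

```python
import torch
from model.feature_model import FeatureModel

# Downloads the MAE ViT-B/16 checkpoint from MODEL_URLS on first use.
extractor = FeatureModel(image_size=224, model_name='vit_base_patch16_224_mae')
images = torch.rand(2, 3, 224, 224)  # RGB in [0, 1]; ImageNet normalization happens inside forward()

with torch.no_grad():
    feats = extractor(images, return_type='features')     # (2, 768, 224, 224), pixel-aligned after bilinear upscaling
    cls_tok = extractor(images, return_type='cls_token')  # (2, 768)
```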
model/model.py
ADDED
@@ -0,0 +1,303 @@
import inspect
import random
from typing import Optional

import torch
import torch.nn.functional as F
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
from diffusers.schedulers.scheduling_ddim import DDIMScheduler
from diffusers.schedulers.scheduling_pndm import PNDMScheduler
from pytorch3d.implicitron.dataset.data_loader_map_provider import FrameData
from pytorch3d.renderer.cameras import CamerasBase
from pytorch3d.structures import Pointclouds
from torch import Tensor
from tqdm import tqdm

from .model_utils import get_num_points, get_custom_betas
from .point_cloud_model import PointCloudModel
from .projection_model import PointCloudProjectionModel


class ConditionalPointCloudDiffusionModel(PointCloudProjectionModel):

    def __init__(
        self,
        beta_start: float,
        beta_end: float,
        beta_schedule: str,
        point_cloud_model: str,
        point_cloud_model_embed_dim: int,
        **kwargs,  # projection arguments
    ):
        super().__init__(**kwargs)

        # Checks
        if not self.predict_shape:
            raise NotImplementedError('Must predict shape if performing diffusion.')

        # Create diffusion model schedulers which define the sampling timesteps
        self.dm_pred_type = kwargs.get('dm_pred_type', "epsilon")
        assert self.dm_pred_type in ['epsilon', 'sample']
        scheduler_kwargs = {"prediction_type": self.dm_pred_type}
        if beta_schedule == 'custom':
            scheduler_kwargs.update(dict(trained_betas=get_custom_betas(beta_start=beta_start, beta_end=beta_end)))
        else:
            scheduler_kwargs.update(dict(beta_start=beta_start, beta_end=beta_end, beta_schedule=beta_schedule))
        self.schedulers_map = {
            'ddpm': DDPMScheduler(**scheduler_kwargs, clip_sample=False),
            'ddim': DDIMScheduler(**scheduler_kwargs, clip_sample=False),
            'pndm': PNDMScheduler(**scheduler_kwargs),
        }
        self.scheduler = self.schedulers_map['ddpm']  # this can be changed for inference

        # Create point cloud model for processing point cloud at each diffusion step
        self.init_pcloud_model(kwargs, point_cloud_model, point_cloud_model_embed_dim)

        self.load_sample_init = kwargs.get('load_sample_init', False)
        self.sample_init_scale = kwargs.get('sample_init_scale', 1.0)
        self.test_init_with_gtpc = kwargs.get('test_init_with_gtpc', False)

        self.consistent_center = kwargs.get('consistent_center', False)
        self.cam_noise_std = kwargs.get('cam_noise_std', 0.0)  # add noise to camera based on timestamps

    def init_pcloud_model(self, kwargs, point_cloud_model, point_cloud_model_embed_dim):
        self.point_cloud_model = PointCloudModel(
            model_type=point_cloud_model,
            embed_dim=point_cloud_model_embed_dim,
            in_channels=self.in_channels,
            out_channels=self.out_channels,  # voxel resolution multiplier is 1.
            voxel_resolution_multiplier=kwargs.get('voxel_resolution_multiplier', 1)
        )

    def forward_train(
        self,
        pc: Pointclouds,
        camera: Optional[CamerasBase],
        image_rgb: Optional[Tensor],
        mask: Optional[Tensor],
        return_intermediate_steps: bool = False,
        **kwargs
    ):

        # Normalize colors and convert to tensor
        x_0 = self.point_cloud_to_tensor(pc, normalize=True, scale=True)  # this will not pack the point colors
        B, N, D = x_0.shape

        # Sample random noise
        noise = torch.randn_like(x_0)
        if self.consistent_center:
            # modification suggested by https://arxiv.org/pdf/2308.07837.pdf
            noise = noise - torch.mean(noise, dim=1, keepdim=True)

        # Sample random timesteps for each point_cloud
        timestep = torch.randint(0, self.scheduler.num_train_timesteps, (B,),
                                 device=self.device, dtype=torch.long)

        # Add noise to points
        x_t = self.scheduler.add_noise(x_0, noise, timestep)  # diffusion noisy adding, only add to the coordinate, not features

        # add noise to the camera pose, based on timestamps
        if self.cam_noise_std > 0.000001:
            # the noise is very different
            camera = camera.clone()
            camT = camera.T  # (B, 3)
            dist = torch.sqrt(torch.sum(camT**2, -1, keepdim=True))
            nratio = timestep[:, None] / self.scheduler.num_train_timesteps  # time-dependent noise
            tnoise = torch.randn(B, 3).to(dist.device)/3. * dist * self.cam_noise_std * nratio
            camera.T = camera.T + tnoise

        # Conditioning, the pixel-aligned feature is based on points with noise (new points)
        x_t_input = self.get_diffu_input(camera, image_rgb, mask, timestep, x_t, **kwargs)

        # Forward
        loss, noise_pred = self.compute_loss(noise, timestep, x_0, x_t_input)

        # Whether to return intermediate steps
        if return_intermediate_steps:
            return loss, (x_0, x_t, noise, noise_pred)

        return loss

    def compute_loss(self, noise, timestep, x_0, x_t_input):
        x_pred = torch.zeros_like(x_0)
        if self.self_conditioning:
            # self conditioning, from https://openreview.net/pdf?id=3itjR9QxFw
            if random.uniform(0, 1.) > 0.5:
                with torch.no_grad():
                    x_pred = self.point_cloud_model(torch.cat([x_t_input, x_pred], -1), timestep)
            noise_pred = self.point_cloud_model(torch.cat([x_t_input, x_pred], -1), timestep)
        else:
            noise_pred = self.point_cloud_model(x_t_input, timestep)
        # Check
        if not noise_pred.shape == noise.shape:
            raise ValueError(f'{noise_pred.shape=} and {noise.shape=}')
        # Loss
        if self.dm_pred_type == 'epsilon':
            loss = F.mse_loss(noise_pred, noise)
        elif self.dm_pred_type == 'sample':
            loss = F.mse_loss(noise_pred, x_0)  # predicting sample
        else:
            raise NotImplementedError
        return loss, noise_pred

    def get_diffu_input(self, camera, image_rgb, mask, timestep, x_t, **kwargs):
        "return: (B, N, D), the exact input to the diffusion model, x_t: (B, N, 3)"
        x_t_input = self.get_input_with_conditioning(x_t, camera=camera,
                                                     image_rgb=image_rgb, mask=mask, t=timestep)
        return x_t_input

    @torch.no_grad()
    def forward_sample(
        self,
        num_points: int,
        camera: Optional[CamerasBase],
        image_rgb: Optional[Tensor],
        mask: Optional[Tensor],
        # Optional overrides
        scheduler: Optional[str] = 'ddpm',
        # Inference parameters
        num_inference_steps: Optional[int] = 1000,
        eta: Optional[float] = 0.0,  # for DDIM
        # Whether to return all the intermediate steps in generation
        return_sample_every_n_steps: int = -1,
        # Whether to disable tqdm
        disable_tqdm: bool = False,
        gt_pc: Pointclouds = None,
        **kwargs
    ):

        # Get scheduler from mapping, or use self.scheduler if None
        scheduler = self.scheduler if scheduler is None else self.schedulers_map[scheduler]

        # Get the size of the noise
        N = num_points
        B = 1 if image_rgb is None else image_rgb.shape[0]
        D = self.get_x_T_channel()
        device = self.device if image_rgb is None else image_rgb.device

        sample_from_interm = kwargs.get('sample_from_interm', False)
        interm_steps = kwargs.get('noise_step') if sample_from_interm else -1
        x_t = self.initialize_x_T(device, gt_pc, (B, N, D), interm_steps, scheduler)
        x_pred = torch.zeros_like(x_t)

        # Set timesteps
        extra_step_kwargs = self.setup_reverse_process(eta, num_inference_steps, scheduler)

        # Loop over timesteps
        all_outputs = []
        return_all_outputs = (return_sample_every_n_steps > 0)
        progress_bar = tqdm(scheduler.timesteps.to(device), desc=f'Sampling ({x_t.shape})', disable=disable_tqdm)

        for i, t in enumerate(progress_bar):
            add_interm_output = (return_all_outputs and (
                    i % return_sample_every_n_steps == 0 or i == len(scheduler.timesteps) - 1))
            # Conditioning
            x_t_input = self.get_diffu_input(camera, image_rgb, mask, t, x_t, **kwargs)
            if self.self_conditioning:
                x_t_input = torch.cat([x_t_input, x_pred], -1)  # add self-conditioning
            inference_binary = (i == len(progress_bar) - 1) | add_interm_output
            # One reverse step with conditioning
            x_t = self.reverse_step(extra_step_kwargs, scheduler, t, x_t, x_t_input,
                                    inference_binary=inference_binary)  # (B, N, D), D=3 or 4
            x_pred = x_t  # for next iteration self conditioning

            # Append to output list if desired
            if add_interm_output:
                all_outputs.append(x_t)

        # Convert output back into a point cloud, undoing normalization and scaling
        output = self.tensor_to_point_cloud(x_t, denormalize=True, unscale=True)  # this convert the points back to original scale
        if return_all_outputs:
            all_outputs = torch.stack(all_outputs, dim=1)  # (B, sample_steps, N, D)
            all_outputs = [self.tensor_to_point_cloud(o, denormalize=True, unscale=True) for o in all_outputs]

        return (output, all_outputs) if return_all_outputs else output

    def get_x_T_channel(self):
        D = 3 + (self.color_channels if self.predict_color else 0)
        return D

    def initialize_x_T(self, device, gt_pc, shape, interm_steps: int = -1, scheduler=None):
        B, N, D = shape
        # Sample noise initialization
        if interm_steps > 0:
            # Sample from some intermediate steps
            x_0 = self.point_cloud_to_tensor(gt_pc, normalize=True, scale=True)
            noise = torch.randn(B, N, D, device=device)

            # always make sure the noise does not change the pc center, this is important to reduce 0.1cm CD!
            noise = noise - torch.mean(noise, dim=1, keepdim=True)

            x_t = scheduler.add_noise(x_0, noise, torch.tensor([interm_steps - 1] * B).long().to(device))  # Add noise
        else:
            # Sample from random Gaussian
            x_t = torch.randn(B, N, D, device=device)

        x_t = x_t * self.sample_init_scale  # for test
        if self.consistent_center:
            x_t = x_t - torch.mean(x_t, dim=1, keepdim=True)
        return x_t

    def reverse_step(self, extra_step_kwargs, scheduler, t, x_t, x_t_input, **kwargs):
        """
        run one reverse step to compute x_t
        :param extra_step_kwargs:
        :param scheduler:
        :param t: [1], diffusion time step
        :param x_t: (B, N, 3)
        :param x_t_input: conditional features (B, N, F)
        :param kwargs: other configurations to run diffusion step
        :return: denoised x_t
        """
        B = x_t.shape[0]
        # Forward
        noise_pred = self.point_cloud_model(x_t_input, t.reshape(1).expand(B))
        if self.consistent_center:
            assert self.dm_pred_type != 'sample', 'incompatible dm predition type for CCD!'
            # suggested by the CCD-3DR paper
            noise_pred = noise_pred - torch.mean(noise_pred, dim=1, keepdim=True)
        # Step
        x_t = scheduler.step(noise_pred, t, x_t, **extra_step_kwargs).prev_sample
        if self.consistent_center:
            x_t = x_t - torch.mean(x_t, dim=1, keepdim=True)
        return x_t

    def setup_reverse_process(self, eta, num_inference_steps, scheduler):
        """
        setup diffusion chain, and others.
        """
        accepts_offset = "offset" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        extra_set_kwargs = {"offset": 1} if accepts_offset else {}
        scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
        # Prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
        # and should be between [0, 1]
        accepts_eta = "eta" in set(inspect.signature(scheduler.step).parameters.keys())
        extra_step_kwargs = {"eta": eta} if accepts_eta else {}
        return extra_step_kwargs

    def forward(self, batch: FrameData, mode: str = 'train', **kwargs):
        """
        A wrapper around the forward method for training and inference
        """
        if isinstance(batch, dict):  # fixes a bug with multiprocessing where batch becomes a dict
            batch = FrameData(**batch)  # it really makes no sense, I do not understand it

        if mode == 'train':
            return self.forward_train(
                pc=batch.sequence_point_cloud,
                camera=batch.camera,
                image_rgb=batch.image_rgb,
                mask=batch.fg_probability,
                **kwargs)
        elif mode == 'sample':
            num_points = kwargs.pop('num_points', get_num_points(batch.sequence_point_cloud))
            return self.forward_sample(
                num_points=num_points,
                camera=batch.camera,
                image_rgb=batch.image_rgb,
                mask=batch.fg_probability,
                **kwargs)
        else:
            raise NotImplementedError()
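For reference (not part of the diff), the core of `forward_train` is the standard forward-diffusion step, plus the "consistent center" trick of re-centering the noise per point cloud. A minimal standalone sketch with illustrative beta values:

```python
import torch
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler

scheduler = DDPMScheduler(beta_start=1e-5, beta_end=8e-3, beta_schedule='linear', clip_sample=False)

x_0 = torch.randn(4, 1000, 3)                     # a batch of normalized point clouds (B, N, 3)
noise = torch.randn_like(x_0)
noise = noise - noise.mean(dim=1, keepdim=True)   # consistent_center: zero-mean noise keeps the cloud centroid fixed
t = torch.randint(0, scheduler.config.num_train_timesteps, (4,))

x_t = scheduler.add_noise(x_0, noise, t)          # noisy coordinates fed to the point cloud network
```

The network is then trained with an MSE loss against `noise` (epsilon prediction) or against `x_0` (sample prediction), matching `compute_loss` above.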
model/model_coloring.py
ADDED
@@ -0,0 +1,84 @@
from typing import Optional

import torch
import torch.nn.functional as F
from pytorch3d.implicitron.dataset.data_loader_map_provider import FrameData
from pytorch3d.renderer.cameras import CamerasBase
from pytorch3d.structures import Pointclouds
from torch import Tensor

from .point_cloud_transformer_model import PointCloudTransformerModel
from .projection_model import PointCloudProjectionModel


class PointCloudColoringModel(PointCloudProjectionModel):

    def __init__(
        self,
        point_cloud_model: str,
        point_cloud_model_layers: int,
        point_cloud_model_embed_dim: int,
        **kwargs,  # projection arguments
    ):
        super().__init__(**kwargs)

        # Checks
        if self.predict_shape or not self.predict_color:
            raise NotImplementedError('Must predict color, not shape, for coloring')

        # Create point cloud model for processing point cloud
        self.point_cloud_model = PointCloudTransformerModel(
            num_layers=point_cloud_model_layers,
            model_type=point_cloud_model,
            embed_dim=point_cloud_model_embed_dim,
            in_channels=self.in_channels,
            out_channels=self.out_channels,
        )  # why use transformer instead???

    def _forward(
        self,
        pc: Pointclouds,
        camera: Optional[CamerasBase],
        image_rgb: Optional[Tensor],
        mask: Optional[Tensor],
        return_point_cloud: bool = False,
        noise_std: float = 0.0,
    ):

        # Normalize colors and convert to tensor
        x = self.point_cloud_to_tensor(pc, normalize=True, scale=True)
        x_points, x_colors = x[:, :, :3], x[:, :, 3:]

        # Add noise to points. TODO: Add to configs.
        x_input = x_points + torch.randn_like(x_points) * noise_std  # simulate noise of the predicted pc?

        # Conditioning
        # x_input = self.get_input_with_conditioning(x_input, camera=camera,
        #                                            image_rgb=image_rgb, mask=mask)
        # XH: edit to run
        x_input = self.get_input_with_conditioning(x_input, camera=camera,
                                                   image_rgb=image_rgb, mask=mask, t=None)

        # Forward
        pred_colors = self.point_cloud_model(x_input)

        # During inference, we return the point cloud with the predicted colors
        if return_point_cloud:
            pred_pointcloud = self.tensor_to_point_cloud(
                torch.cat((x_points, pred_colors), dim=2), denormalize=True, unscale=True)
            return pred_pointcloud

        # During training, we have ground truth colors and return the loss
        loss = F.mse_loss(pred_colors, x_colors)
        return loss

    def forward(self, batch: FrameData, **kwargs):
        """A wrapper around the forward method"""
        if isinstance(batch, dict):  # fixes a bug with multiprocessing where batch becomes a dict
            batch = FrameData(**batch)  # it really makes no sense, I do not understand it
        return self._forward(
            pc=batch.sequence_point_cloud,
            camera=batch.camera,
            image_rgb=batch.image_rgb,
            mask=batch.fg_probability,
            **kwargs,
        )
model/model_diff_data.py
ADDED
@@ -0,0 +1,238 @@
"""
model to deal with shapenet inputs and other datasets such as Behave and ProciGen
the model takes a different data dictionary in forward function
"""
import inspect
from typing import Optional
import numpy as np

import torch
import torch.nn.functional as F
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
from diffusers.schedulers.scheduling_ddim import DDIMScheduler
from diffusers.schedulers.scheduling_pndm import PNDMScheduler
from pytorch3d.implicitron.dataset.data_loader_map_provider import FrameData
from pytorch3d.renderer.cameras import CamerasBase
from pytorch3d.structures import Pointclouds
from torch import Tensor
from tqdm import tqdm
from pytorch3d.renderer import PerspectiveCameras
from pytorch3d.datasets.r2n2.utils import BlenderCamera


from .model import ConditionalPointCloudDiffusionModel
from .model_utils import get_num_points


class ConditionalPCDiffusionShapenet(ConditionalPointCloudDiffusionModel):
    def forward(self, batch, mode: str = 'train', **kwargs):
        """
        take a batch of data from ShapeNet
        """
        images = torch.stack(batch['images'], 0).to('cuda')
        masks = torch.stack(batch['masks'], 0).to('cuda')
        pc = Pointclouds([x.to('cuda') for x in batch['pclouds']])
        camera = BlenderCamera(
            torch.stack(batch['R']),
            torch.stack(batch['T']),
            torch.stack(batch['K']), device='cuda'
        )

        if mode == 'train':
            return self.forward_train(
                pc=pc,
                camera=camera,
                image_rgb=images,
                mask=masks,
                **kwargs)
        elif mode == 'sample':
            num_points = kwargs.pop('num_points', get_num_points(pc))
            return self.forward_sample(
                num_points=num_points,
                camera=camera,
                image_rgb=images,
                mask=masks,
                gt_pc=pc,
                **kwargs)
        else:
            raise NotImplementedError()


class ConditionalPCDiffusionBehave(ConditionalPointCloudDiffusionModel):
    "diffusion model for Behave dataset"
    def forward(self, batch, mode: str = 'train', **kwargs):
        images = torch.stack(batch['images'], 0).to('cuda')
        masks = torch.stack(batch['masks'], 0).to('cuda')
        pc = self.get_input_pc(batch)
        camera = PerspectiveCameras(
            R=torch.stack(batch['R']),
            T=torch.stack(batch['T']),
            K=torch.stack(batch['K']),
            device='cuda',
            in_ndc=True
        )
        grid_df = torch.stack(batch['grid_df'], 0).to('cuda') if 'grid_df' in batch else None
        num_points = kwargs.pop('num_points', get_num_points(pc))
        if mode == 'train':
            return self.forward_train(
                pc=pc,
                camera=camera,
                image_rgb=images,
                mask=masks,
                grid_df=grid_df,
                **kwargs)
        elif mode == 'sample':
            return self.forward_sample(
                num_points=num_points,
                camera=camera,
                image_rgb=images,
                mask=masks,
                gt_pc=pc,
                **kwargs)
        else:
            raise NotImplementedError()

    def get_input_pc(self, batch):
        pc = Pointclouds([x.to('cuda') for x in batch['pclouds']])
        return pc


class ConditionalPCDiffusionSeparateSegm(ConditionalPCDiffusionBehave):
    "a separate model to predict binary labels, the final segmentation model"
    def __init__(self,
                 beta_start: float,
                 beta_end: float,
                 beta_schedule: str,
                 point_cloud_model: str,
                 point_cloud_model_embed_dim: int,
                 **kwargs,  # projection arguments
                 ):
        super(ConditionalPCDiffusionSeparateSegm, self).__init__(beta_start, beta_end, beta_schedule,
                                                                 point_cloud_model,
                                                                 point_cloud_model_embed_dim, **kwargs)
        # add a separate model to predict binary label
        from .point_cloud_transformer_model import PointCloudTransformerModel, PointCloudModel

        self.binary_model = PointCloudTransformerModel(
            num_layers=1,  # XH: use the default color model number of layers
            model_type=point_cloud_model,  # pvcnn
            embed_dim=point_cloud_model_embed_dim,  # save as pc shape model
            in_channels=self.in_channels,
            out_channels=1,
        )
        self.binary_training_noise_std = kwargs.get("binary_training_noise_std", 0.1)

        # re-initialize point cloud model
        assert self.predict_binary
        self.point_cloud_model = PointCloudModel(
            model_type=point_cloud_model,
            embed_dim=point_cloud_model_embed_dim,
            in_channels=self.in_channels,
            out_channels=self.out_channels - 1,  # not predicting binary from this anymore
            voxel_resolution_multiplier=kwargs.get('voxel_resolution_multiplier', 1)
        )

    def forward_train(
        self,
        pc: Pointclouds,
        camera: Optional[CamerasBase],
        image_rgb: Optional[Tensor],
        mask: Optional[Tensor],
        return_intermediate_steps: bool = False,
        **kwargs
    ):
        # first run shape forward, then binary label forward
        assert not return_intermediate_steps
        assert self.predict_binary
        loss_shape = super(ConditionalPCDiffusionSeparateSegm, self).forward_train(pc,
                                                                                   camera,
                                                                                   image_rgb,
                                                                                   mask,
                                                                                   return_intermediate_steps,
                                                                                   **kwargs)

        # binary label forward
        x_0 = self.point_cloud_to_tensor(pc, normalize=True, scale=True)
        x_points, x_colors = x_0[:, :, :3], x_0[:, :, 3:]

        # Add noise to points.
        x_input = x_points + torch.randn_like(x_points) * self.binary_training_noise_std  # std=0.1
        x_input = self.get_input_with_conditioning(x_input, camera=camera,
                                                   image_rgb=image_rgb, mask=mask, t=None)

        # Forward
        pred_segm = self.binary_model(x_input)

        # use compressed bits
        df_grid = kwargs.get('grid_df', None).unsqueeze(1)  # (B, 1, resz, resy, resx)
        points = x_points.clone().detach() / self.scale_factor * 2  # , normalize to [-1, 1]
        points[:, :, 0], points[:, :, 2] = points[:, :, 2].clone(), points[:, :, 0].clone()  # swap, make sure clone is used!
        points = points.unsqueeze(1).unsqueeze(1)  # (B,1, 1, N, 3)
        with torch.no_grad():
            df_interp = F.grid_sample(df_grid, points, padding_mode='border', align_corners=True).squeeze(1).squeeze(1)  # (B, 1, 1, 1, N)
            binary_label = df_interp[:, 0] > 0.5  # (B, 1, N)

        binary_pred = torch.sigmoid(pred_segm.squeeze(-1))  # add a sigmoid layer
        loss_binary = F.mse_loss(binary_pred, binary_label.float().squeeze(1).squeeze(1)) * self.lw_binary
        loss = loss_shape + loss_binary

        return loss, torch.tensor([loss_shape, loss_binary])

    def reverse_step(self, extra_step_kwargs, scheduler, t, x_t, x_t_input, **kwargs):
        "return (B, N, 4), the 4-th channel is binary label"
        B = x_t.shape[0]
        # Forward
        noise_pred = self.point_cloud_model(x_t_input, t.reshape(1).expand(B))
        if self.consistent_center:
            assert self.dm_pred_type != 'sample', 'incompatible dm predition type!'
            # suggested by the CCD-3DR paper
            noise_pred = noise_pred - torch.mean(noise_pred, dim=1, keepdim=True)
        # Step: make sure only update the shape (first 3 channels)
        x_t = scheduler.step(noise_pred, t, x_t[:, :, :3], **extra_step_kwargs).prev_sample
        if self.consistent_center:
            x_t = x_t - torch.mean(x_t, dim=1, keepdim=True)

        # also add binary prediction
        if kwargs.get('inference_binary', False):
            pred_segm = self.binary_model(x_t_input)
        else:
            pred_segm = torch.zeros_like(x_t[:, :, 0:1])

        x_t = torch.cat([x_t, torch.sigmoid(pred_segm)], -1)

        return x_t

    def get_coord_feature(self, x_t):
        x_t_input = [x_t[:, :, :3]]
        return x_t_input

    def tensor_to_point_cloud(self, x: Tensor, /, denormalize: bool = False, unscale: bool = False):
        """
        take binary label into account
        :param self:
        :param x: (B, N, 4), the 4th channel is the binary segmentation, 1-human, 0-object
        :param denormalize: denormalize the per-point colors, from pc2
        :param unscale: undo point scaling, from pc2
        :return: pc with point colors if predict binary label or per-point color
        """
        points = x[:, :, :3] / (self.scale_factor if unscale else 1)
        if self.predict_color:
            colors = self.denormalize(x[:, :, 3:]) if denormalize else x[:, :, 3:]
            return Pointclouds(points=points, features=colors)
        else:
            if self.predict_binary:
                assert x.shape[2] == 4
                # add color to predicted binary labels
                is_hum = x[:, :, 3] > 0.5
                features = []
                for mask in is_hum:
                    color = torch.zeros_like(x[0, :, :3]) + torch.tensor([0.5, 1.0, 0]).to(x.device)
                    color[mask, :] = torch.tensor([0.05, 1.0, 1.0]).to(x.device)  # human is light blue, object light green
                    features.append(color)
            else:
                assert x.shape[2] == 3
                features = None
            return Pointclouds(points=points, features=features)
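A small standalone sketch (not part of the diff) of how the binary human/object label is read off a 3D grid with `F.grid_sample`, as in `forward_train` above; the grid resolution and values are toy placeholders, and the axis reordering mirrors the x/z swap in the code:

```python
import torch
import torch.nn.functional as F

B, N, res = 2, 6, 32
df_grid = torch.rand(B, 1, res, res, res)     # stand-in for grid_df, stored as (B, 1, resz, resy, resx)
points = torch.rand(B, N, 3) * 2 - 1          # query points already normalized to [-1, 1]

# grid_sample expects xyz ordered to match (resx, resy, resz); hence the swap of the first and last axis
points = points[..., [2, 1, 0]]
points = points.unsqueeze(1).unsqueeze(1)     # (B, 1, 1, N, 3)

vals = F.grid_sample(df_grid, points, padding_mode='border', align_corners=True)  # (B, 1, 1, 1, N)
binary_label = vals[:, 0, 0, 0] > 0.5         # (B, N) pseudo ground-truth label per point
```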
model/model_hoattn.py
ADDED
@@ -0,0 +1,457 @@
"""
model that use cross attention to predict human + object
"""

import inspect
import random
from typing import Optional
from torch import Tensor
import torch
import numpy as np

from pytorch3d.structures import Pointclouds
from pytorch3d.renderer import CamerasBase
from .model_diff_data import ConditionalPCDiffusionBehave
from .pvcnn.pvcnn_ho import PVCNN2HumObj
import torch.nn.functional as F
from pytorch3d.renderer import PerspectiveCameras
from .model_utils import get_num_points
from tqdm import tqdm


class CrossAttenHODiffusionModel(ConditionalPCDiffusionBehave):
    def init_pcloud_model(self, kwargs, point_cloud_model, point_cloud_model_embed_dim):
        """use cross attention model"""
        if point_cloud_model == 'pvcnn':
            self.point_cloud_model = PVCNN2HumObj(embed_dim=point_cloud_model_embed_dim,
                                                  num_classes=self.out_channels,
                                                  extra_feature_channels=(self.in_channels - 3),
                                                  voxel_resolution_multiplier=kwargs.get('voxel_resolution_multiplier', 1),
                                                  attn_type=kwargs.get('attn_type', 'simple-cross'),
                                                  attn_weight=kwargs.get("attn_weight", 1.0)
                                                  )
        else:
            raise ValueError(f"Unknown point cloud model {point_cloud_model}!")
        self.point_visible_test = kwargs.get("point_visible_test", 'single')  # when doing point visibility test, use only human points or human + object?
        assert self.point_visible_test in ['single', 'combine'], f'invalide point visible test option {self.point_visible_test}'
        # print(f"Point visibility test is based on {self.point_visible_test} point clouds!")

    def forward_train(
        self,
        pc: Pointclouds,
        camera: Optional[CamerasBase],
        image_rgb: Optional[Tensor],
        mask: Optional[Tensor],
        return_intermediate_steps: bool = False,
        **kwargs
    ):
        "additional input (RGB, mask, camera, and pc) for object is read from kwargs"
        # assert not self.consistent_center
        assert not self.self_conditioning

        # Normalize colors and convert to tensor
        x0_h = self.point_cloud_to_tensor(pc, normalize=True, scale=True)  # this will not pack the point colors
        x0_o = self.point_cloud_to_tensor(kwargs.get('pc_obj'), normalize=True, scale=True)
        B, N, D = x0_h.shape

        # Sample random noise
        noise = torch.randn_like(x0_h)
        if self.consistent_center:
            # modification suggested by https://arxiv.org/pdf/2308.07837.pdf
            noise = noise - torch.mean(noise, dim=1, keepdim=True)

        # Sample random timesteps for each point_cloud
        timestep = torch.randint(0, self.scheduler.num_train_timesteps, (B,),
                                 device=self.device, dtype=torch.long)
        # timestep = torch.randint(0, 1, (B,),
        #                          device=self.device, dtype=torch.long)

        # Add noise to points
        xt_h = self.scheduler.add_noise(x0_h, noise, timestep)
        xt_o = self.scheduler.add_noise(x0_o, noise, timestep)
        norm_parms = self.pack_norm_params(kwargs)  # (2, B, 4)

        # get input conditioning
        x_t_input_h, x_t_input_o = self.get_image_conditioning(camera, image_rgb, kwargs, mask, norm_parms, timestep,
                                                               xt_h, xt_o)

        # Diffusion prediction
        noise_pred_h, noise_pred_o = self.point_cloud_model(x_t_input_h, x_t_input_o, timestep, norm_parms)

        # Check
        if not noise_pred_h.shape == noise.shape:
            raise ValueError(f'{noise_pred_h.shape=} and {noise.shape=}')
        if not noise_pred_o.shape == noise.shape:
            raise ValueError(f'{noise_pred_o.shape=} and {noise.shape=}')

        # Loss
        loss_h = F.mse_loss(noise_pred_h, noise)
        loss_o = F.mse_loss(noise_pred_o, noise)

        loss = loss_h + loss_o

        # Whether to return intermediate steps
        if return_intermediate_steps:
            return loss, (x0_h, xt_h, noise, noise_pred_h)

        return loss, torch.tensor([loss_h, loss_o])

    def get_image_conditioning(self, camera, image_rgb, kwargs, mask, norm_parms, timestep, xt_h, xt_o):
        """
        compute image features for each point
        :param camera:
        :param image_rgb:
        :param kwargs:
        :param mask:
        :param norm_parms:
        :param timestep:
        :param xt_h:
        :param xt_o:
        :return:
        """
        if self.point_visible_test == 'single':
            # Visibility test is down independently for human and object
            x_t_input_h = self.get_input_with_conditioning(xt_h, camera=camera,
                                                           image_rgb=image_rgb, mask=mask, t=timestep)
            x_t_input_o = self.get_input_with_conditioning(xt_o, camera=kwargs.get('camera_obj'),
                                                           image_rgb=kwargs.get('rgb_obj'),
                                                           mask=kwargs.get('mask_obj'), t=timestep)
        elif self.point_visible_test == 'combine':
            # Combine human + object points to do visibility test and obtain features
            B, N = xt_h.shape[:2]  # (B, N, 3)
            # for human: transform object points first to H+O space, then to human space
            xt_o_in_ho = xt_o * 2 * norm_parms[1, :, 3:].unsqueeze(1) + norm_parms[1, :, :3].unsqueeze(1)
            xt_o_in_hum = (xt_o_in_ho - norm_parms[0, :, :3].unsqueeze(1)) / (2 * norm_parms[0, :, 3:].unsqueeze(1))
            # compute features for all points, take only first half feature for human
            x_t_input_h = self.get_input_with_conditioning(torch.cat([xt_h, xt_o_in_hum], 1), camera=camera,
                                                           image_rgb=image_rgb, mask=mask, t=timestep)[:, :N]
            # for object: transform human points to H+O space, then to object space
            xt_h_in_ho = xt_h * 2 * norm_parms[0, :, 3:].unsqueeze(1) + norm_parms[0, :, :3].unsqueeze(1)
            xt_h_in_obj = (xt_h_in_ho - norm_parms[1, :, :3].unsqueeze(1)) / (2 * norm_parms[1, :, 3:].unsqueeze(1))
            x_t_input_o = self.get_input_with_conditioning(torch.cat([xt_o, xt_h_in_obj], 1),
                                                           camera=kwargs.get('camera_obj'),
                                                           image_rgb=kwargs.get('rgb_obj'),
                                                           mask=kwargs.get('mask_obj'), t=timestep)[:, :N]
        else:
            raise NotImplementedError
        return x_t_input_h, x_t_input_o

    def forward(self, batch, mode: str = 'train', **kwargs):
        """"""
        images = torch.stack(batch['images'], 0).to('cuda')
        masks = torch.stack(batch['masks'], 0).to('cuda')
        pc = self.get_input_pc(batch)
        camera = PerspectiveCameras(
            R=torch.stack(batch['R']),
            T=torch.stack(batch['T_hum']),
            K=torch.stack(batch['K_hum']),
            device='cuda',
            in_ndc=True
        )
        grid_df = torch.stack(batch['grid_df'], 0).to('cuda') if 'grid_df' in batch else None
        num_points = kwargs.pop('num_points', get_num_points(pc))

        rgb_obj = torch.stack(batch['images_obj'], 0).to('cuda')
        masks_obj = torch.stack(batch['masks_obj'], 0).to('cuda')
        pc_obj = Pointclouds([x.to('cuda') for x in batch['pclouds_obj']])
        camera_obj = PerspectiveCameras(
            R=torch.stack(batch['R']),
            T=torch.stack(batch['T_obj']),
            K=torch.stack(batch['K_obj']),
            device='cuda',
            in_ndc=True
        )

        # normalization parameters
        cent_hum = torch.stack(batch['cent_hum'], 0).to('cuda')
        cent_obj = torch.stack(batch['cent_obj'], 0).to('cuda')  # B, 3
        radius_hum = torch.stack(batch['radius_hum'], 0).to('cuda')  # B, 1
        radius_obj = torch.stack(batch['radius_obj'], 0).to('cuda')

        # print(batch['image_path'])

        if mode == 'train':
            return self.forward_train(
                pc=pc,
                camera=camera,
                image_rgb=images,
                mask=masks,
                grid_df=grid_df,
                rgb_obj=rgb_obj,
                mask_obj=masks_obj,
                pc_obj=pc_obj,
                camera_obj=camera_obj,
                cent_hum=cent_hum,
                cent_obj=cent_obj,
                radius_hum=radius_hum,
                radius_obj=radius_obj,
            )
        elif mode == 'sample':
            # this use GT centers to do projection
            return self.forward_sample(
                num_points=num_points,
                camera=camera,
                image_rgb=images,
                mask=masks,
                gt_pc=pc,
                rgb_obj=rgb_obj,
                mask_obj=masks_obj,
                pc_obj=pc_obj,
                camera_obj=camera_obj,
                cent_hum=cent_hum,
                cent_obj=cent_obj,
                radius_hum=radius_hum,
                radius_obj=radius_obj,
                **kwargs)
        elif mode == 'interm-gt':
            return self.forward_sample(
                num_points=num_points,
                camera=camera,
                image_rgb=images,
                mask=masks,
                gt_pc=pc,
                rgb_obj=rgb_obj,
                mask_obj=masks_obj,
                pc_obj=pc_obj,
                camera_obj=camera_obj,
                cent_hum=cent_hum,
                cent_obj=cent_obj,
                radius_hum=radius_hum,
                radius_obj=radius_obj,
                sample_from_interm=True,
                **kwargs)
        elif mode == 'interm-pred':
            # use camera from predicted
            camera = PerspectiveCameras(
                R=torch.stack(batch['R']),
                T=torch.stack(batch['T_hum_scaled']),
                K=torch.stack(batch['K_hum']),
                device='cuda',
                in_ndc=True
            )
            camera_obj = PerspectiveCameras(
                R=torch.stack(batch['R']),
                T=torch.stack(batch['T_obj_scaled']),
                K=torch.stack(batch['K_obj']),  # the camera should be human/object specific!!!
                device='cuda',
                in_ndc=True
            )
            # use pc from predicted
            pc = Pointclouds([x.to('cuda') for x in batch['pred_hum']])
            pc_obj = Pointclouds([x.to('cuda') for x in batch['pred_obj']])
            # use center and radius from predicted
            cent_hum = torch.stack(batch['cent_hum_pred'], 0).to('cuda')
            cent_obj = torch.stack(batch['cent_obj_pred'], 0).to('cuda')  # B, 3
            radius_hum = torch.stack(batch['radius_hum_pred'], 0).to('cuda')  # B, 1
            radius_obj = torch.stack(batch['radius_obj_pred'], 0).to('cuda')

            return self.forward_sample(
                num_points=num_points,
                camera=camera,
                image_rgb=images,
                mask=masks,
                gt_pc=pc,
                rgb_obj=rgb_obj,
                mask_obj=masks_obj,
                pc_obj=pc_obj,
                camera_obj=camera_obj,
                cent_hum=cent_hum,
                cent_obj=cent_obj,
                radius_hum=radius_hum,
                radius_obj=radius_obj,
                sample_from_interm=True,
                **kwargs)
        elif mode == 'interm-pred-ts':
            # use only estimate translation and scale, but sample from gaussian
            # this works, the camera is GT!!!
            pc = Pointclouds([x.to('cuda') for x in batch['pred_hum']])
            pc_obj = Pointclouds([x.to('cuda') for x in batch['pred_obj']])
            # use center and radius from predicted
            cent_hum = torch.stack(batch['cent_hum_pred'], 0).to('cuda')
            cent_obj = torch.stack(batch['cent_obj_pred'], 0).to('cuda')  # B, 3
            radius_hum = torch.stack(batch['radius_hum_pred'], 0).to('cuda')  # B, 1
            radius_obj = torch.stack(batch['radius_obj_pred'], 0).to('cuda')
            # print(cent_hum[0], radius_hum[0], cent_obj[0], radius_obj[0])

            return self.forward_sample(
                num_points=num_points,
                camera=camera,
                image_rgb=images,
                mask=masks,
                gt_pc=pc,
                rgb_obj=rgb_obj,
                mask_obj=masks_obj,
                pc_obj=pc_obj,
                camera_obj=camera_obj,
                cent_hum=cent_hum,
                cent_obj=cent_obj,
                radius_hum=radius_hum,
                radius_obj=radius_obj,
                sample_from_interm=False,
                **kwargs)
        else:
            raise NotImplementedError

    def forward_sample(
        self,
        num_points: int,
        camera: Optional[CamerasBase],
        image_rgb: Optional[Tensor],
        mask: Optional[Tensor],
        # Optional overrides
        scheduler: Optional[str] = 'ddpm',
        # Inference parameters
        num_inference_steps: Optional[int] = 1000,
        eta: Optional[float] = 0.0,  # for DDIM
        # Whether to return all the intermediate steps in generation
        return_sample_every_n_steps: int = -1,
        # Whether to disable tqdm
        disable_tqdm: bool = False,
        gt_pc: Pointclouds = None,
        **kwargs
    ):
        "use two models to run diffusion forward, and also use translation and scale to put them back"
        assert not self.self_conditioning
        # Get scheduler from mapping, or use self.scheduler if None
        scheduler = self.scheduler if scheduler is None else self.schedulers_map[scheduler]

        # Get the size of the noise
        N = num_points
        B = 1 if image_rgb is None else image_rgb.shape[0]
        D = self.get_x_T_channel()
        device = self.device if image_rgb is None else image_rgb.device

        # sample from full steps or only a few steps
        sample_from_interm = kwargs.get('sample_from_interm', False)
        interm_steps = kwargs.get('noise_step') if sample_from_interm else -1

        xt_h = self.initialize_x_T(device, gt_pc, (B, N, D), interm_steps, scheduler)
        xt_o = self.initialize_x_T(device, kwargs.get('pc_obj', None), (B, N, D), interm_steps, scheduler)

        # the segmentation mask
        segm_mask = torch.zeros(B, 2*N, 1).to(device)
        segm_mask[:, :N] = 1.0

        # Set timesteps
        extra_step_kwargs = self.setup_reverse_process(eta, num_inference_steps, scheduler)

        # Loop over timesteps
        all_outputs = []
        return_all_outputs = (return_sample_every_n_steps > 0)
        progress_bar = tqdm(self.get_reverse_timesteps(scheduler, interm_steps),
                            desc=f'Sampling ({xt_h.shape})', disable=disable_tqdm)

        # print("Camera T:", camera.T[0], camera.R[0])
        # print("Camera_obj T:", kwargs.get('camera_obj').T[0], kwargs.get('camera_obj').R[0])

        norm_parms = self.pack_norm_params(kwargs)
        for i, t in enumerate(progress_bar):
            x_t_input_h, x_t_input_o = self.get_image_conditioning(camera, image_rgb,
                                                                   kwargs, mask,
                                                                   norm_parms,
                                                                   t,
                                                                   xt_h, xt_o)

            # One reverse step with conditioning
            xt_h, xt_o = self.reverse_step(extra_step_kwargs, scheduler, t, torch.stack([xt_h, xt_o], 0),
                                           torch.stack([x_t_input_h, x_t_input_o], 0), **kwargs)  # (B, N, D), D=3

            if (return_all_outputs and (i % return_sample_every_n_steps == 0 or i == len(scheduler.timesteps) - 1)):
                # print(xt_h.shape, kwargs.get('cent_hum').shape, kwargs.get('radius_hum').shape)
                x_t = torch.cat([self.denormalize_pclouds(xt_h, kwargs.get('cent_hum'), kwargs.get('radius_hum')),
                                 self.denormalize_pclouds(xt_o, kwargs.get('cent_obj'), kwargs.get('radius_obj'))], 1)
                # print(x_t.shape, xt_o.shape)
                all_outputs.append(torch.cat([x_t, segm_mask], -1))
                # print("Updating intermediate...")

        # Convert output back into a point cloud, undoing normalization and scaling
        x_t = torch.cat([self.denormalize_pclouds(xt_h, kwargs.get('cent_hum'), kwargs.get('radius_hum')),
                         self.denormalize_pclouds(xt_o, kwargs.get('cent_obj'), kwargs.get('radius_obj'))], 1)
        x_t = torch.cat([x_t, segm_mask], -1)
        output = self.tensor_to_point_cloud(x_t, denormalize=False, unscale=False)  # this convert the points back to original scale
        if return_all_outputs:
            all_outputs = torch.stack(all_outputs, dim=1)  # (B, sample_steps, N, D)
            all_outputs = [self.tensor_to_point_cloud(o, denormalize=False, unscale=False) for o in all_outputs]

        return (output, all_outputs) if return_all_outputs else output

    def get_reverse_timesteps(self, scheduler, interm_steps: int):
        """

        :param scheduler:
        :param interm_steps: start from some intermediate steps
        :return:
        """
        if interm_steps > 0:
            timesteps = torch.from_numpy(np.arange(0, interm_steps)[::-1].copy()).to(self.device)
        else:
            timesteps = scheduler.timesteps.to(self.device)
        return timesteps

    def pack_norm_params(self, kwargs: dict, scale=True):
        scale_factor = self.scale_factor if scale else 1.0
        hum = torch.cat([kwargs.get('cent_hum')*scale_factor, kwargs.get('radius_hum')], -1)
        obj = torch.cat([kwargs.get('cent_obj')*scale_factor, kwargs.get('radius_obj')], -1)
        return torch.stack([hum, obj], 0)  # (2, B, 4)

    def reverse_step(self, extra_step_kwargs, scheduler, t, x_t, x_t_input, **kwargs):
        "x_t: (2, B, D, N), x_t_input: (2, B, D, N)"
        norm_parms = self.pack_norm_params(kwargs)  # (2, B, 4)
        B = x_t.shape[1]
        # print(f"Step {t} Norm params:", norm_parms[:, 0, :])
        noise_pred_h, noise_pred_o = self.point_cloud_model(x_t_input[0], x_t_input[1], t.reshape(1).expand(B),
                                                            norm_parms)
        if self.consistent_center:
            assert self.dm_pred_type != 'sample', 'incompatible dm predition type!'
            noise_pred_h = noise_pred_h - torch.mean(noise_pred_h, dim=1, keepdim=True)
            noise_pred_o = noise_pred_o - torch.mean(noise_pred_o, dim=1, keepdim=True)

        xt_h = scheduler.step(noise_pred_h, t, x_t[0], **extra_step_kwargs).prev_sample
        xt_o = scheduler.step(noise_pred_o, t, x_t[1], **extra_step_kwargs).prev_sample

        if self.consistent_center:
            xt_h = xt_h - torch.mean(xt_h, dim=1, keepdim=True)
            xt_o = xt_o - torch.mean(xt_o, dim=1, keepdim=True)

        return xt_h, xt_o

    def denormalize_pclouds(self, x: Tensor, cent, radius, unscale: bool = True):
        """
        first denormalize, then apply center and scale to original H+O coordinate
        :param x:
        :param cent: (B, 3)
        :param radius: (B, 1)
        :param unscale:
        :return:
        """
        # denormalize: scale down.
        points = x[:, :, :3] / (self.scale_factor if unscale else 1)
        # translation and scale back to H+O coordinate
        points = points * 2 * radius.unsqueeze(-1) + cent.unsqueeze(1)
        return points

    def tensor_to_point_cloud(self, x: Tensor, /, denormalize: bool = False, unscale: bool = False):
        """
        take binary into account
        :param self:
        :param x: (B, N, 4)
        :param denormalize:
        :param unscale:
        :return:
        """
        points = x[:, :, :3] / (self.scale_factor if unscale else 1)
        if self.predict_color:
            colors = self.denormalize(x[:, :, 3:]) if denormalize else x[:, :, 3:]
            return Pointclouds(points=points, features=colors)
        else:
            assert x.shape[2] == 4
            # add color to predicted binary labels
            is_hum = x[:, :, 3] > 0.5
            features = []
            for mask in is_hum:
                color = torch.zeros_like(x[0, :, :3]) + torch.tensor([0.5, 1.0, 0]).to(x.device)
                color[mask, :] = torch.tensor([0.05, 1.0, 1.0]).to(x.device)  # human is light blue, object light green
                features.append(color)
            return Pointclouds(points=points, features=features)
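For intuition (not part of the diff), the human- and object-centric point clouds are related to the joint H+O frame by a per-cloud center and radius, which is what `pack_norm_params`, `get_image_conditioning`, and `denormalize_pclouds` rely on. A minimal round-trip sketch; how the dataset actually computes `cent` and `radius` is an assumption here:

```python
import torch

B, N = 2, 8
points_ho = torch.randn(B, N, 3)                                       # points in the joint human+object frame
cent = points_ho.mean(dim=1)                                           # (B, 3), illustrative center
radius = points_ho.flatten(1).abs().max(dim=1, keepdim=True).values    # (B, 1), illustrative radius

# normalize into the per-entity frame: x_norm = (x - cent) / (2 * radius)
points_norm = (points_ho - cent.unsqueeze(1)) / (2 * radius.unsqueeze(-1))

# denormalize_pclouds inverts it: scale back up and re-apply the center
points_back = points_norm * 2 * radius.unsqueeze(-1) + cent.unsqueeze(1)
assert torch.allclose(points_back, points_ho, atol=1e-5)
```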
model/model_utils.py
ADDED
@@ -0,0 +1,58 @@
1 |
+
import cv2
|
2 |
+
import numpy as np
|
3 |
+
import torch
|
4 |
+
import torch.nn as nn
|
5 |
+
from pytorch3d.structures import Pointclouds
|
6 |
+
|
7 |
+
|
8 |
+
def set_requires_grad(module: nn.Module, requires_grad: bool):
|
9 |
+
for p in module.parameters():
|
10 |
+
p.requires_grad_(requires_grad)
|
11 |
+
|
12 |
+
|
13 |
+
def compute_distance_transform(mask: torch.Tensor):
|
14 |
+
"""
|
15 |
+
|
16 |
+
Parameters
|
17 |
+
----------
|
18 |
+
mask (B, 1, H, W) or (B, 2, H, W) true for foreground
|
19 |
+
|
20 |
+
Returns
|
21 |
+
-------
|
22 |
+
the vector to the closest foreground pixel, zero if inside mask
|
23 |
+
|
24 |
+
"""
|
25 |
+
C = mask.shape[1]
|
26 |
+
assert C in [1, 2], f'invalid mask shape {mask.shape} found!'
|
27 |
+
|
28 |
+
image_size = mask.shape[-1]
|
29 |
+
|
30 |
+
dts = []
|
31 |
+
for i in range(C):
|
32 |
+
distance_transform = torch.stack([
|
33 |
+
torch.from_numpy(cv2.distanceTransform(
|
34 |
+
(1 - m), distanceType=cv2.DIST_L2, maskSize=cv2.DIST_MASK_3
|
35 |
+
) / (image_size / 2))
|
36 |
+
for m in mask[:, i:i+1].squeeze(1).detach().cpu().numpy().astype(np.uint8)
|
37 |
+
]).unsqueeze(1).clip(0, 1).to(mask.device)
|
38 |
+
dts.append(distance_transform)
|
39 |
+
return torch.cat(dts, 1)
|
40 |
+
|
41 |
+
|
42 |
+
def default(x, d):
|
43 |
+
return d if x is None else x
|
44 |
+
|
45 |
+
|
46 |
+
def get_num_points(x: Pointclouds, /):
|
47 |
+
return x.points_padded().shape[1]
|
48 |
+
|
49 |
+
|
50 |
+
def get_custom_betas(beta_start: float, beta_end: float, warmup_frac: float = 0.3, num_train_timesteps: int = 1000):
|
51 |
+
"""Custom beta schedule"""
|
52 |
+
betas = np.linspace(beta_start, beta_end, num_train_timesteps, dtype=np.float32)
|
53 |
+
warmup_frac = 0.3
|
54 |
+
warmup_time = int(num_train_timesteps * warmup_frac)
|
55 |
+
warmup_steps = np.linspace(beta_start, beta_end, warmup_time, dtype=np.float64)
|
56 |
+
warmup_time = min(warmup_time, num_train_timesteps)
|
57 |
+
betas[:warmup_time] = warmup_steps[:warmup_time]
|
58 |
+
return betas
|
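A minimal shape-check sketch for the distance-transform helper above (not part of the diff; sizes and the rectangular masks are assumed, and OpenCV must be installed):

```python
import torch

# Hypothetical human+object mask of shape (B, 2, H, W).
mask = torch.zeros(1, 2, 224, 224)
mask[:, 0, 60:160, 60:160] = 1    # fake human mask
mask[:, 1, 100:200, 120:220] = 1  # fake object mask

dt = compute_distance_transform(mask > 0.5)
# Per-channel distance to the nearest foreground pixel, normalized by H/2 and clipped to [0, 1].
print(dt.shape)  # torch.Size([1, 2, 224, 224])
```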
model/point_cloud_model.py
ADDED
@@ -0,0 +1,67 @@
from contextlib import nullcontext

import torch
from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers import ModelMixin
from torch import Tensor

from .pvcnn.pvcnn import PVCNN2
from .pvcnn.pvcnn_plus_plus import PVCNN2PlusPlus
from .simple.simple_model import SimplePointModel


class PointCloudModel(ModelMixin, ConfigMixin):
    @register_to_config
    def __init__(
        self,
        model_type: str = 'pvcnn',
        in_channels: int = 3,
        out_channels: int = 3,
        embed_dim: int = 64,
        dropout: float = 0.1,
        width_multiplier: int = 1,
        voxel_resolution_multiplier: int = 1,
    ):
        super().__init__()
        self.model_type = model_type
        if self.model_type == 'pvcnn':
            self.autocast_context = torch.autocast('cuda', dtype=torch.float32)
            self.model = PVCNN2(
                embed_dim=embed_dim,
                num_classes=out_channels,
                extra_feature_channels=(in_channels - 3),
                dropout=dropout, width_multiplier=width_multiplier,
                voxel_resolution_multiplier=voxel_resolution_multiplier
            )
            self.model.classifier[-1].bias.data.normal_(0, 1e-6)
            self.model.classifier[-1].weight.data.normal_(0, 1e-6)
        elif self.model_type == 'pvcnnplusplus':
            self.autocast_context = torch.autocast('cuda', dtype=torch.float32)
            self.model = PVCNN2PlusPlus(
                embed_dim=embed_dim,
                num_classes=out_channels,
                extra_feature_channels=(in_channels - 3),
            )
            self.model.output_projection[-1].bias.data.normal_(0, 1e-6)
            self.model.output_projection[-1].weight.data.normal_(0, 1e-6)
        elif self.model_type == 'simple':
            self.autocast_context = nullcontext()
            self.model = SimplePointModel(
                embed_dim=embed_dim,
                num_classes=out_channels,
                extra_feature_channels=(in_channels - 3),
            )
            self.model.output_projection.bias.data.normal_(0, 1e-6)
            self.model.output_projection.weight.data.normal_(0, 1e-6)
        else:
            raise NotImplementedError()

    def forward(self, inputs: Tensor, t: Tensor, ret_feats=False) -> Tensor:
        """ Receives input of shape (B, N, in_channels) and returns output
        of shape (B, N, out_channels) """
        with self.autocast_context:
            if not ret_feats:
                return self.model(inputs.transpose(1, 2), t, ret_feats=False).transpose(1, 2)
            else:
                pred, feats = self.model(inputs.transpose(1, 2), t, ret_feats=True)
                return pred.transpose(1, 2), feats
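A rough usage sketch of the wrapper above (not from the repo; the channel counts are assumed, and the `pvcnn` backbone requires the compiled CUDA extension and a GPU):

```python
import torch

# Hypothetical sizes: 3 xyz channels plus 391 per-point conditioning channels.
model = PointCloudModel(model_type='pvcnn', in_channels=3 + 391, out_channels=3, embed_dim=64).cuda()
x_t = torch.randn(2, 4096, 3 + 391).cuda()   # (B, N, in_channels)
t = torch.randint(0, 1000, (2,)).cuda()      # diffusion timesteps
noise_pred = model(x_t, t)                   # (B, N, 3), near zero at init due to the tiny output weights
print(noise_pred.shape)
```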
model/point_cloud_transformer_model.py
ADDED
@@ -0,0 +1,80 @@
from typing import Optional

import torch
import torch.nn as nn
from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers import ModelMixin
from torch import Tensor
from timm.models.vision_transformer import Attention, LayerScale, DropPath, Mlp

from .point_cloud_model import PointCloudModel


class PointCloudModelBlock(nn.Module):

    def __init__(
        self,
        *,
        # Point cloud model
        dim: int,
        model_type: str = 'pvcnn',
        dropout: float = 0.1,
        width_multiplier: int = 1,
        voxel_resolution_multiplier: int = 1,
        # Transformer model
        num_heads=6, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., init_values=None,
        drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, use_attn=False
    ):
        super().__init__()

        # Point cloud model
        self.norm0 = norm_layer(dim)
        self.point_cloud_model = PointCloudModel(model_type=model_type,
            in_channels=dim, out_channels=dim, embed_dim=dim, dropout=dropout,
            width_multiplier=width_multiplier, voxel_resolution_multiplier=voxel_resolution_multiplier)
        self.ls0 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path0 = DropPath(drop_path) if drop_path > 0. else nn.Identity()

        # Attention
        self.use_attn = use_attn
        if self.use_attn:
            self.norm1 = norm_layer(dim)
            self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
            self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
            self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()

        # MLP
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()

    def apply_point_cloud_model(self, x: Tensor, t: Optional[Tensor] = None) -> Tensor:
        t = t if t is not None else torch.zeros(len(x), device=x.device, dtype=torch.long)
        return self.point_cloud_model(x, t)

    def forward(self, x: Tensor):
        x = x + self.drop_path0(self.ls0(self.apply_point_cloud_model(self.norm0(x))))
        if self.use_attn:
            x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x))))
        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
        return x


class PointCloudTransformerModel(ModelMixin, ConfigMixin):
    @register_to_config
    def __init__(self, num_layers: int, in_channels: int = 3, out_channels: int = 3, embed_dim: int = 64, **kwargs):
        super().__init__()
        self.num_layers = num_layers
        self.input_projection = nn.Linear(in_channels, embed_dim)
        self.blocks = nn.Sequential(*[PointCloudModelBlock(dim=embed_dim, **kwargs) for i in range(self.num_layers)])
        self.norm = nn.LayerNorm(embed_dim)
        self.output_projection = nn.Linear(embed_dim, out_channels)

    def forward(self, inputs: Tensor) -> Tensor:
        """ Receives input of shape (B, N, in_channels) and returns output
        of shape (B, N, out_channels) """
        x = self.input_projection(inputs)
        x = self.blocks(x)
        x = self.output_projection(x)
        return x
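A minimal shape sketch for the stacked model above (assumed sizes; the default `pvcnn` block type again needs the compiled CUDA ops and a GPU):

```python
import torch

# Hypothetical configuration: 4 blocks without attention, mapping 6 input channels to 3 outputs.
net = PointCloudTransformerModel(num_layers=4, in_channels=6, out_channels=3, embed_dim=64).cuda()
x = torch.rand(2, 2048, 6).cuda()  # (B, N, in_channels)
y = net(x)                         # (B, N, 3)
print(y.shape)
```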
model/projection_model.py
ADDED
@@ -0,0 +1,273 @@
from typing import Optional, Union

import torch
from diffusers.schedulers import DDIMScheduler, DDPMScheduler, PNDMScheduler
from diffusers.schedulers.scheduling_lms_discrete import LMSDiscreteScheduler
from diffusers import ModelMixin
from pytorch3d.implicitron.dataset.data_loader_map_provider import FrameData
from pytorch3d.renderer import PointsRasterizationSettings, PointsRasterizer
from pytorch3d.renderer.cameras import CamerasBase
from pytorch3d.structures import Pointclouds
from torch import Tensor

from .feature_model import FeatureModel
from .model_utils import compute_distance_transform

SchedulerClass = Union[DDPMScheduler, DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]


class PointCloudProjectionModel(ModelMixin):

    def __init__(
        self,
        image_size: int,
        image_feature_model: str,
        use_local_colors: bool = True,
        use_local_features: bool = True,
        use_global_features: bool = False,
        use_mask: bool = True,
        use_distance_transform: bool = True,
        predict_shape: bool = True,
        predict_color: bool = False,
        process_color: bool = False,
        image_color_channels: int = 3,  # for the input image, not the points
        color_channels: int = 3,  # for the points, not the input image
        colors_mean: float = 0.5,
        colors_std: float = 0.5,
        scale_factor: float = 1.0,
        # Rasterization settings
        raster_point_radius: float = 0.0075,  # point size
        raster_points_per_pixel: int = 1,  # a single point per pixel, for now
        bin_size: int = 0,
        model_name=None,
        # additional arguments added by XH
        load_sample_init=False,
        sample_init_scale=1.0,
        test_init_with_gtpc=False,
        consistent_center=False,  # from https://arxiv.org/pdf/2308.07837.pdf
        voxel_resolution_multiplier: int = 1,
        predict_binary: bool = False,  # predict a binary class label
        lw_binary: float = 1.0,
        binary_training_noise_std: float = 0.1,
        dm_pred_type: str = 'epsilon',  # diffusion prediction type
        self_conditioning=False,
        **kwargs,
    ):
        super().__init__()
        self.image_size = image_size
        self.scale_factor = scale_factor
        self.use_local_colors = use_local_colors
        self.use_local_features = use_local_features
        self.use_global_features = use_global_features
        self.use_mask = use_mask
        self.use_distance_transform = use_distance_transform
        self.predict_shape = predict_shape  # default False
        self.predict_color = predict_color  # default True
        self.process_color = process_color
        self.image_color_channels = image_color_channels
        self.color_channels = color_channels
        self.colors_mean = colors_mean
        self.colors_std = colors_std
        self.model_name = model_name
        print("PointCloud Model scale factor:", self.scale_factor, 'Model name:', self.model_name)
        self.predict_binary = predict_binary
        self.lw_binary = lw_binary
        self.self_conditioning = self_conditioning

        # Types of conditioning that are used
        self.use_local_conditioning = self.use_local_colors or self.use_local_features or self.use_mask
        self.use_global_conditioning = self.use_global_features
        self.kwargs = kwargs

        # Create feature model
        self.feature_model = FeatureModel(image_size, image_feature_model)

        # Input size
        self.in_channels = 3  # 3 for 3D point positions
        if self.use_local_colors:  # whether color should be an input
            self.in_channels += self.image_color_channels
        if self.use_local_features:
            self.in_channels += self.feature_model.feature_dim
        if self.use_global_features:
            self.in_channels += self.feature_model.feature_dim
        if self.use_mask:
            self.in_channels += 2 if self.use_distance_transform else 1
        if self.process_color:
            self.in_channels += self.color_channels  # point color added to input or not, default False
        if self.self_conditioning:
            self.in_channels += 3  # add self conditioning

        self.in_channels = self.add_extra_input_chennels(self.in_channels)

        if self.model_name in ['pc2-diff-ho-sepsegm', 'diff-ho-attn']:
            self.in_channels += 2 if self.use_distance_transform else 1

        # Output size
        self.out_channels = 0
        if self.predict_shape:
            self.out_channels += 3
        if self.predict_color:
            self.out_channels += self.color_channels
        if self.predict_binary:
            print("Output binary classification score!")
            self.out_channels += 1

        # Save rasterization settings
        self.raster_settings = PointsRasterizationSettings(
            image_size=(image_size, image_size),
            radius=raster_point_radius,
            points_per_pixel=raster_points_per_pixel,
            bin_size=bin_size,
        )

    def add_extra_input_chennels(self, input_channels):
        return input_channels

    def denormalize(self, x: Tensor, /, clamp: bool = True):
        x = x * self.colors_std + self.colors_mean
        return torch.clamp(x, 0, 1) if clamp else x

    def normalize(self, x: Tensor, /):
        x = (x - self.colors_mean) / self.colors_std
        return x

    def get_global_conditioning(self, image_rgb: Tensor):
        global_conditioning = []
        if self.use_global_features:
            global_conditioning.append(self.feature_model(image_rgb,
                                                          return_cls_token_only=True))  # (B, D)
        global_conditioning = torch.cat(global_conditioning, dim=1)  # (B, D_cond)
        return global_conditioning

    def get_local_conditioning(self, image_rgb: Tensor, mask: Tensor):
        """
        compute per-point conditioning
        Parameters
        ----------
        image_rgb: (B, 3, 224, 224), values normalized to 0-1, background is masked by the given mask
        mask: (B, 1, 224, 224), or (B, 2, 224, 224) for h+o
        """
        local_conditioning = []
        # import pdb; pdb.set_trace()

        if self.use_local_colors:  # XH: default True
            local_conditioning.append(self.normalize(image_rgb))
        if self.use_local_features:  # XH: default True
            local_conditioning.append(self.feature_model(image_rgb))  # I guess no mask here? feature model: 'vit_small_patch16_224_mae'
        if self.use_mask:  # default True
            local_conditioning.append(mask.float())
        if self.use_distance_transform:  # default True
            if not self.use_mask:
                raise ValueError('No mask for distance transform?')
            if mask.is_floating_point():
                mask = mask > 0.5
            local_conditioning.append(compute_distance_transform(mask))
        local_conditioning = torch.cat(local_conditioning, dim=1)  # (B, D_cond, H, W)
        return local_conditioning

    @torch.autocast('cuda', dtype=torch.float32)
    def surface_projection(
        self, points: Tensor, camera: CamerasBase, local_features: Tensor,
    ):
        B, C, H, W, device = *local_features.shape, local_features.device
        R = self.raster_settings.points_per_pixel
        N = points.shape[1]

        # Scale camera by scaling T. ASSUMES CAMERA IS LOOKING AT ORIGIN!
        camera = camera.clone()
        camera.T = camera.T * self.scale_factor

        # Create rasterizer
        rasterizer = PointsRasterizer(cameras=camera, raster_settings=self.raster_settings)

        # Associate points with features via rasterization
        fragments = rasterizer(Pointclouds(points))  # (B, H, W, R)
        fragments_idx: Tensor = fragments.idx.long()
        visible_pixels = (fragments_idx > -1)  # (B, H, W, R)
        points_to_visible_pixels = fragments_idx[visible_pixels]

        # Reshape local features to (B, H, W, R, C)
        local_features = local_features.permute(0, 2, 3, 1).unsqueeze(-2).expand(-1, -1, -1, R, -1)  # (B, H, W, R, C)

        # Get local features corresponding to visible points
        local_features_proj = torch.zeros(B * N, C, device=device)
        # local feature includes: raw RGB color, image features, mask, distance transform
        local_features_proj[points_to_visible_pixels] = local_features[visible_pixels]
        local_features_proj = local_features_proj.reshape(B, N, C)

        return local_features_proj

    def point_cloud_to_tensor(self, pc: Pointclouds, /, normalize: bool = False, scale: bool = False):
        """Converts a point cloud to a tensor, with color if and only if self.predict_color"""
        points = pc.points_padded() * (self.scale_factor if scale else 1)
        if self.predict_color and pc.features_padded() is not None:  # normalize color, not point locations
            colors = self.normalize(pc.features_padded()) if normalize else pc.features_padded()
            return torch.cat((points, colors), dim=2)
        else:
            return points

    def tensor_to_point_cloud(self, x: Tensor, /, denormalize: bool = False, unscale: bool = False):
        points = x[:, :, :3] / (self.scale_factor if unscale else 1)
        if self.predict_color:
            colors = self.denormalize(x[:, :, 3:]) if denormalize else x[:, :, 3:]
            return Pointclouds(points=points, features=colors)
        else:
            assert x.shape[2] == 3
            return Pointclouds(points=points)

    def get_input_with_conditioning(
        self,
        x_t: Tensor,
        camera: Optional[CamerasBase],
        image_rgb: Optional[Tensor],
        mask: Optional[Tensor],
        t: Optional[Tensor],
    ):
        """ Extracts local features from the input image and projects them onto the points
        in the point cloud to obtain the input to the model. Then extracts global
        features, replicates them across points, and concats them to the input.
        image_rgb: masked background
        XH: why there is no positional encoding as described by the supp??
        """
        B, N = x_t.shape[:2]

        # Initial input is the point locations (and colors if and only if predicting color)
        x_t_input = self.get_coord_feature(x_t)

        # Local conditioning
        if self.use_local_conditioning:

            # Get local features and check that they are the same size as the input image
            local_features = self.get_local_conditioning(image_rgb=image_rgb, mask=mask)  # concatenate RGB + mask + RGB feature + distance transform
            if local_features.shape[-2:] != image_rgb.shape[-2:]:
                raise ValueError(f'{local_features.shape=} and {image_rgb.shape=}')

            # Project local features. Here that we only need the point locations, not colors
            local_features_proj = self.surface_projection(points=x_t[:, :, :3],
                camera=camera, local_features=local_features)  # (B, N, D_local)

            x_t_input.append(local_features_proj)

        # Global conditioning
        if self.use_global_conditioning:  # False

            # Get and repeat global features
            global_features = self.get_global_conditioning(image_rgb=image_rgb)  # (B, D_global)
            global_features = global_features.unsqueeze(1).expand(-1, N, -1)  # (B, D_global, N)

            x_t_input.append(global_features)

        # Concatenate together all the pointwise features
        x_t_input = torch.cat(x_t_input, dim=2)  # (B, N, D)

        return x_t_input

    def get_coord_feature(self, x_t):
        """get coordinate feature, for model that uses separate model to predict binary, we use first 3 channels only"""
        x_t_input = [x_t]
        return x_t_input

    def forward(self, batch: FrameData, mode: str = 'train', **kwargs):
        """ The forward method may be defined differently for different models. """
        raise NotImplementedError()
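A back-of-the-envelope check of the per-point input width built in `__init__` above (a sketch with assumed numbers: a 384-dim ViT-small feature model and the default conditioning flags, with the extra mask/distance channels added for the `diff-ho-attn` variant):

```python
# xyz + normalized RGB + ViT feature + (mask + distance transform) for two masks
in_channels = 3 + 3 + 384 + 2 + 2
print(in_channels)  # 394, fed to the point cloud backbone as extra feature channels
```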
model/pvcnn/__init__.py
ADDED
File without changes
model/pvcnn/modules/__init__.py
ADDED
@@ -0,0 +1,8 @@
from .ball_query import BallQuery, BallQueryHO
from .frustum import FrustumPointNetLoss
from .loss import KLLoss
from .pointnet import PointNetAModule, PointNetSAModule, PointNetFPModule
from .pvconv import PVConv, Attention, Swish, PVConvReLU
from .se import SE3d
from .shared_mlp import SharedMLP
from .voxelization import Voxelization
model/pvcnn/modules/ball_query.py
ADDED
@@ -0,0 +1,69 @@
import torch
import torch.nn as nn

from . import functional as F

__all__ = ['BallQuery']


class BallQuery(nn.Module):
    def __init__(self, radius, num_neighbors, include_coordinates=True):
        super().__init__()
        self.radius = radius
        self.num_neighbors = num_neighbors
        self.include_coordinates = include_coordinates

    def forward(self, points_coords, centers_coords, temb, points_features=None):
        points_coords = points_coords.contiguous()
        centers_coords = centers_coords.contiguous()
        neighbor_indices = F.ball_query(centers_coords, points_coords, self.radius, self.num_neighbors)
        neighbor_coordinates = F.grouping(points_coords, neighbor_indices)
        neighbor_coordinates = neighbor_coordinates - centers_coords.unsqueeze(-1)

        if points_features is None:
            assert self.include_coordinates, 'No Features For Grouping'
            neighbor_features = neighbor_coordinates
        else:
            neighbor_features = F.grouping(points_features, neighbor_indices)  # return [B, C, M, U] C=feat dim, M=# centers, U=# neighbours
            if self.include_coordinates:
                neighbor_features = torch.cat([neighbor_coordinates, neighbor_features], dim=1)
        return neighbor_features, F.grouping(temb, neighbor_indices)

    def extra_repr(self):
        return 'radius={}, num_neighbors={}{}'.format(
            self.radius, self.num_neighbors, ', include coordinates' if self.include_coordinates else '')


class BallQueryHO(nn.Module):
    "no point feature, but only relative and abs coordinate"
    def __init__(self, radius, num_neighbors, include_relative=False):
        super().__init__()
        self.radius = radius
        self.num_neighbors = num_neighbors
        self.include_relative = include_relative

    def forward(self, points_coords, centers_coords, points_features=None):
        """
        if not enough points inside the given radius, the entries will be zero
        if too many points inside the radius, the order is random??? (not sure)
        :param points_coords: (B, 3, N)
        :param centers_coords: (B, 3, M)
        :param points_features: None
        :return:
        """
        points_coords = points_coords.contiguous()
        centers_coords = centers_coords.contiguous()
        neighbor_indices = F.ball_query(centers_coords, points_coords, self.radius, self.num_neighbors)
        neighbor_coordinates = F.grouping(points_coords, neighbor_indices)  # (B, 3, M, U)
        if self.include_relative:
            neighbor_coordinates_rela = neighbor_coordinates - centers_coords.unsqueeze(-1)
            neighbor_coordinates = torch.cat([neighbor_coordinates, neighbor_coordinates_rela], 1)  # (B, 6, M, U)
        # flatten the coordinate
        neighbor_coordinates = neighbor_coordinates.permute(0, 1, 3, 2)  # (B, 3/6, U, M)
        neighbor_coordinates = torch.flatten(neighbor_coordinates, 1, 2)  # (B, 3*U, M)
        return neighbor_coordinates

    def extra_repr(self):
        return 'radius={}, num_neighbors={}{}'.format(
            self.radius, self.num_neighbors, ', include relative' if self.include_relative else '')
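A shape sketch for `BallQueryHO` (assumed sizes, requires the compiled CUDA backend and a GPU): with `include_relative=True`, absolute and relative coordinates are stacked and then flattened over the neighbor dimension, so the output is `(B, 6*U, M)`.

```python
import torch

bq = BallQueryHO(radius=0.2, num_neighbors=16, include_relative=True)
points = torch.rand(2, 3, 4096).cuda()    # (B, 3, N), e.g. human points
centers = torch.rand(2, 3, 128).cuda()    # (B, 3, M), e.g. object query centers
out = bq(points, centers)
print(out.shape)  # torch.Size([2, 96, 128]) = (B, 6*16, M)
```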
model/pvcnn/modules/frustum.py
ADDED
@@ -0,0 +1,138 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from . import functional as PF

__all__ = ['FrustumPointNetLoss', 'get_box_corners_3d']


class FrustumPointNetLoss(nn.Module):
    def __init__(self, num_heading_angle_bins, num_size_templates, size_templates, box_loss_weight=1.0,
                 corners_loss_weight=10.0, heading_residual_loss_weight=20.0, size_residual_loss_weight=20.0):
        super().__init__()
        self.box_loss_weight = box_loss_weight
        self.corners_loss_weight = corners_loss_weight
        self.heading_residual_loss_weight = heading_residual_loss_weight
        self.size_residual_loss_weight = size_residual_loss_weight

        self.num_heading_angle_bins = num_heading_angle_bins
        self.num_size_templates = num_size_templates
        self.register_buffer('size_templates', size_templates.view(self.num_size_templates, 3))
        self.register_buffer(
            'heading_angle_bin_centers', torch.arange(0, 2 * np.pi, 2 * np.pi / self.num_heading_angle_bins)
        )

    def forward(self, inputs, targets):
        mask_logits = inputs['mask_logits']  # (B, 2, N)
        center_reg = inputs['center_reg']  # (B, 3)
        center = inputs['center']  # (B, 3)
        heading_scores = inputs['heading_scores']  # (B, NH)
        heading_residuals_normalized = inputs['heading_residuals_normalized']  # (B, NH)
        heading_residuals = inputs['heading_residuals']  # (B, NH)
        size_scores = inputs['size_scores']  # (B, NS)
        size_residuals_normalized = inputs['size_residuals_normalized']  # (B, NS, 3)
        size_residuals = inputs['size_residuals']  # (B, NS, 3)

        mask_logits_target = targets['mask_logits']  # (B, N)
        center_target = targets['center']  # (B, 3)
        heading_bin_id_target = targets['heading_bin_id']  # (B, )
        heading_residual_target = targets['heading_residual']  # (B, )
        size_template_id_target = targets['size_template_id']  # (B, )
        size_residual_target = targets['size_residual']  # (B, 3)

        batch_size = center.size(0)
        batch_id = torch.arange(batch_size, device=center.device)

        # Basic Classification and Regression losses
        mask_loss = F.cross_entropy(mask_logits, mask_logits_target)
        heading_loss = F.cross_entropy(heading_scores, heading_bin_id_target)
        size_loss = F.cross_entropy(size_scores, size_template_id_target)
        center_loss = PF.huber_loss(torch.norm(center_target - center, dim=-1), delta=2.0)
        center_reg_loss = PF.huber_loss(torch.norm(center_target - center_reg, dim=-1), delta=1.0)

        # Refinement losses for size/heading
        heading_residuals_normalized = heading_residuals_normalized[batch_id, heading_bin_id_target]  # (B, )
        heading_residual_normalized_target = heading_residual_target / (np.pi / self.num_heading_angle_bins)
        heading_residual_normalized_loss = PF.huber_loss(
            heading_residuals_normalized - heading_residual_normalized_target, delta=1.0
        )
        size_residuals_normalized = size_residuals_normalized[batch_id, size_template_id_target]  # (B, 3)
        size_residual_normalized_target = size_residual_target / self.size_templates[size_template_id_target]
        size_residual_normalized_loss = PF.huber_loss(
            torch.norm(size_residual_normalized_target - size_residuals_normalized, dim=-1), delta=1.0
        )

        # Bounding box losses
        heading = (heading_residuals[batch_id, heading_bin_id_target]
                   + self.heading_angle_bin_centers[heading_bin_id_target])  # (B, )
        # Warning: in origin code, size_residuals are added twice (issue #43 and #49 in charlesq34/frustum-pointnets)
        size = (size_residuals[batch_id, size_template_id_target]
                + self.size_templates[size_template_id_target])  # (B, 3)
        corners = get_box_corners_3d(centers=center, headings=heading, sizes=size, with_flip=False)  # (B, 3, 8)
        heading_target = self.heading_angle_bin_centers[heading_bin_id_target] + heading_residual_target  # (B, )
        size_target = self.size_templates[size_template_id_target] + size_residual_target  # (B, 3)
        corners_target, corners_target_flip = get_box_corners_3d(centers=center_target, headings=heading_target,
                                                                 sizes=size_target, with_flip=True)  # (B, 3, 8)
        corners_loss = PF.huber_loss(torch.min(
            torch.norm(corners - corners_target, dim=1), torch.norm(corners - corners_target_flip, dim=1)
        ), delta=1.0)
        # Summing up
        loss = mask_loss + self.box_loss_weight * (
            center_loss + center_reg_loss + heading_loss + size_loss
            + self.heading_residual_loss_weight * heading_residual_normalized_loss
            + self.size_residual_loss_weight * size_residual_normalized_loss
            + self.corners_loss_weight * corners_loss
        )

        return loss


def get_box_corners_3d(centers, headings, sizes, with_flip=False):
    """
    :param centers: coords of box centers, FloatTensor[N, 3]
    :param headings: heading angles, FloatTensor[N, ]
    :param sizes: box sizes, FloatTensor[N, 3]
    :param with_flip: bool, whether to return flipped box (headings + np.pi)
    :return:
        coords of box corners, FloatTensor[N, 3, 8]
        NOTE: corner points are in counter clockwise order, e.g.,
          2--1
        3--0 5
          7--4
    """
    l = sizes[:, 0]  # (N,)
    w = sizes[:, 1]  # (N,)
    h = sizes[:, 2]  # (N,)
    x_corners = torch.stack([l/2, l/2, -l/2, -l/2, l/2, l/2, -l/2, -l/2], dim=1)  # (N, 8)
    y_corners = torch.stack([h/2, h/2, h/2, h/2, -h/2, -h/2, -h/2, -h/2], dim=1)  # (N, 8)
    z_corners = torch.stack([w/2, -w/2, -w/2, w/2, w/2, -w/2, -w/2, w/2], dim=1)  # (N, 8)

    c = torch.cos(headings)  # (N,)
    s = torch.sin(headings)  # (N,)
    o = torch.ones_like(headings)  # (N,)
    z = torch.zeros_like(headings)  # (N,)

    centers = centers.unsqueeze(-1)  # (B, 3, 1)
    corners = torch.stack([x_corners, y_corners, z_corners], dim=1)  # (N, 3, 8)
    R = torch.stack([c, z, s, z, o, z, -s, z, c], dim=1).view(-1, 3, 3)  # roty matrix: (N, 3, 3)
    if with_flip:
        R_flip = torch.stack([-c, z, -s, z, o, z, s, z, -c], dim=1).view(-1, 3, 3)
        return torch.matmul(R, corners) + centers, torch.matmul(R_flip, corners) + centers
    else:
        return torch.matmul(R, corners) + centers

    # centers = centers.unsqueeze(1)  # (B, 1, 3)
    # corners = torch.stack([x_corners, y_corners, z_corners], dim=-1)  # (N, 8, 3)
    # RT = torch.stack([c, z, -s, z, o, z, s, z, c], dim=1).view(-1, 3, 3)  # (N, 3, 3)
    # if with_flip:
    #     RT_flip = torch.stack([-c, z, s, z, o, z, -s, z, -c], dim=1).view(-1, 3, 3)  # (N, 3, 3)
    #     return torch.matmul(corners, RT) + centers, torch.matmul(corners, RT_flip) + centers  # (N, 8, 3)
    # else:
    #     return torch.matmul(corners, RT) + centers  # (N, 8, 3)

    # corners = torch.stack([x_corners, y_corners, z_corners], dim=1)  # (N, 3, 8)
    # R = torch.stack([c, z, s, z, o, z, -s, z, c], dim=1).view(-1, 3, 3)  # (N, 3, 3)
    # corners = torch.matmul(R, corners) + centers.unsqueeze(2)  # (N, 3, 8)
    # corners = corners.transpose(1, 2)  # (N, 8, 3)
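A quick sanity check for `get_box_corners_3d` (not from the repo): a unit cube centered at the origin with zero heading should yield corners at plus/minus 0.5 along every axis.

```python
import torch

centers = torch.zeros(1, 3)
headings = torch.zeros(1)   # rotation matrix becomes the identity
sizes = torch.ones(1, 3)    # l = w = h = 1
corners = get_box_corners_3d(centers, headings, sizes)  # (1, 3, 8)
print(corners.abs().max().item())  # 0.5
```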
model/pvcnn/modules/functional/__init__.py
ADDED
@@ -0,0 +1,7 @@
from .ball_query import ball_query
from .devoxelization import trilinear_devoxelize
from .grouping import grouping
from .interpolatation import nearest_neighbor_interpolate
from .loss import kl_loss, huber_loss
from .sampling import gather, furthest_point_sample, logits_mask
from .voxelization import avg_voxelize
|
model/pvcnn/modules/functional/backend.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from pathlib import Path
|
3 |
+
|
4 |
+
from torch.utils.cpp_extension import load
|
5 |
+
|
6 |
+
|
7 |
+
gcc_path = os.getenv('CC', default='/usr/bin/gcc')
|
8 |
+
if not Path(gcc_path).is_file():
|
9 |
+
raise ValueError('Could not find your gcc, please replace it here.')
|
10 |
+
|
11 |
+
_src_path = os.path.dirname(os.path.abspath(__file__))
|
12 |
+
_backend = load(
|
13 |
+
name='_pvcnn_backend',
|
14 |
+
extra_cflags=['-O3', '-std=c++17'],
|
15 |
+
extra_cuda_cflags=[f'--compiler-bindir={gcc_path}'],
|
16 |
+
sources=[os.path.join(_src_path,'src', f) for f in [
|
17 |
+
'ball_query/ball_query.cpp',
|
18 |
+
'ball_query/ball_query.cu',
|
19 |
+
'grouping/grouping.cpp',
|
20 |
+
'grouping/grouping.cu',
|
21 |
+
'interpolate/neighbor_interpolate.cpp',
|
22 |
+
'interpolate/neighbor_interpolate.cu',
|
23 |
+
'interpolate/trilinear_devox.cpp',
|
24 |
+
'interpolate/trilinear_devox.cu',
|
25 |
+
'sampling/sampling.cpp',
|
26 |
+
'sampling/sampling.cu',
|
27 |
+
'voxelization/vox.cpp',
|
28 |
+
'voxelization/vox.cu',
|
29 |
+
'bindings.cpp',
|
30 |
+
]]
|
31 |
+
)
|
32 |
+
|
33 |
+
__all__ = ['_backend']
|
model/pvcnn/modules/functional/ball_query.py
ADDED
@@ -0,0 +1,19 @@
from torch.autograd import Function

from .backend import _backend

__all__ = ['ball_query']


def ball_query(centers_coords, points_coords, radius, num_neighbors):
    """
    :param centers_coords: coordinates of centers, FloatTensor[B, 3, M]
    :param points_coords: coordinates of points, FloatTensor[B, 3, N]
    :param radius: float, radius of ball query
    :param num_neighbors: int, maximum number of neighbors
    :return:
        neighbor_indices: indices of neighbors, IntTensor[B, M, U]
    """
    centers_coords = centers_coords.contiguous()
    points_coords = points_coords.contiguous()
    return _backend.ball_query(centers_coords, points_coords, radius, num_neighbors)
model/pvcnn/modules/functional/devoxelization.py
ADDED
@@ -0,0 +1,42 @@
from torch.autograd import Function

from .backend import _backend

__all__ = ['trilinear_devoxelize']


class TrilinearDevoxelization(Function):
    @staticmethod
    def forward(ctx, features, coords, resolution, is_training=True):
        """
        :param ctx:
        :param coords: the coordinates of points, FloatTensor[B, 3, N]
        :param features: FloatTensor[B, C, R, R, R]
        :param resolution: int, the voxel resolution
        :param is_training: bool, training mode
        :return:
            FloatTensor[B, C, N]
        """
        B, C = features.shape[:2]
        features = features.contiguous().view(B, C, -1)
        coords = coords.contiguous()
        outs, inds, wgts = _backend.trilinear_devoxelize_forward(resolution, is_training, coords, features)
        if is_training:
            ctx.save_for_backward(inds, wgts)
        ctx.r = resolution
        return outs

    @staticmethod
    def backward(ctx, grad_output):
        """
        :param ctx:
        :param grad_output: gradient of outputs, FloatTensor[B, C, N]
        :return:
            gradient of inputs, FloatTensor[B, C, R, R, R]
        """
        inds, wgts = ctx.saved_tensors
        grad_inputs = _backend.trilinear_devoxelize_backward(grad_output.contiguous(), inds, wgts, ctx.r)
        return grad_inputs.view(grad_output.size(0), grad_output.size(1), ctx.r, ctx.r, ctx.r), None, None, None


trilinear_devoxelize = TrilinearDevoxelization.apply
model/pvcnn/modules/functional/grouping.py
ADDED
@@ -0,0 +1,32 @@
from torch.autograd import Function

from .backend import _backend

__all__ = ['grouping']


class Grouping(Function):
    @staticmethod
    def forward(ctx, features, indices):
        """
        :param ctx:
        :param features: features of points, FloatTensor[B, C, N]
        :param indices: neighbor indices of centers, IntTensor[B, M, U], M is #centers, U is #neighbors
        :return:
            grouped_features: grouped features, FloatTensor[B, C, M, U]
        """
        features = features.contiguous()
        indices = indices.contiguous()
        ctx.save_for_backward(indices)
        ctx.num_points = features.size(-1)
        # print(features.dtype, features.shape)
        return _backend.grouping_forward(features, indices)

    @staticmethod
    def backward(ctx, grad_output):
        indices, = ctx.saved_tensors
        grad_features = _backend.grouping_backward(grad_output.contiguous(), indices, ctx.num_points)
        return grad_features, None


grouping = Grouping.apply
model/pvcnn/modules/functional/interpolatation.py
ADDED
@@ -0,0 +1,38 @@
from torch.autograd import Function

from .backend import _backend

__all__ = ['nearest_neighbor_interpolate']


class NeighborInterpolation(Function):
    @staticmethod
    def forward(ctx, points_coords, centers_coords, centers_features):
        """
        :param ctx:
        :param points_coords: coordinates of points, FloatTensor[B, 3, N]
        :param centers_coords: coordinates of centers, FloatTensor[B, 3, M]
        :param centers_features: features of centers, FloatTensor[B, C, M]
        :return:
            points_features: features of points, FloatTensor[B, C, N]
        """
        centers_coords = centers_coords.contiguous()
        points_coords = points_coords.contiguous()
        centers_features = centers_features.contiguous()
        points_features, indices, weights = _backend.three_nearest_neighbors_interpolate_forward(
            points_coords, centers_coords, centers_features
        )
        ctx.save_for_backward(indices, weights)
        ctx.num_centers = centers_coords.size(-1)
        return points_features

    @staticmethod
    def backward(ctx, grad_output):
        indices, weights = ctx.saved_tensors
        grad_centers_features = _backend.three_nearest_neighbors_interpolate_backward(
            grad_output.contiguous(), indices, weights, ctx.num_centers
        )
        return None, None, grad_centers_features


nearest_neighbor_interpolate = NeighborInterpolation.apply
model/pvcnn/modules/functional/loss.py
ADDED
@@ -0,0 +1,17 @@
import torch
import torch.nn.functional as F

__all__ = ['kl_loss', 'huber_loss']


def kl_loss(x, y):
    x = F.softmax(x.detach(), dim=1)
    y = F.log_softmax(y, dim=1)
    return torch.mean(torch.sum(x * (torch.log(x) - y), dim=1))


def huber_loss(error, delta):
    abs_error = torch.abs(error)
    quadratic = torch.min(abs_error, torch.full_like(abs_error, fill_value=delta))
    losses = 0.5 * (quadratic ** 2) + delta * (abs_error - quadratic)
    return torch.mean(losses)
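A worked example for `huber_loss` above (not part of the repo): with `delta = 1`, an error of 0.5 stays in the quadratic regime (0.5 * 0.5^2 = 0.125), while an error of 3 is linearized (0.5 * 1^2 + 1 * (3 - 1) = 2.5); the function returns the mean over all entries.

```python
import torch

err = torch.tensor([0.5, 3.0])
print(huber_loss(err, delta=1.0))  # (0.125 + 2.5) / 2 = 1.3125
```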
model/pvcnn/modules/functional/sampling.py
ADDED
@@ -0,0 +1,84 @@
import numpy as np
import torch
from torch.autograd import Function

from .backend import _backend

__all__ = ['gather', 'furthest_point_sample', 'logits_mask']


class Gather(Function):
    @staticmethod
    def forward(ctx, features, indices):
        """
        Gather
        :param ctx:
        :param features: features of points, FloatTensor[B, C, N]
        :param indices: centers' indices in points, IntTensor[b, m]
        :return:
            centers_coords: coordinates of sampled centers, FloatTensor[B, C, M]
        """
        features = features.contiguous()
        indices = indices.int().contiguous()
        ctx.save_for_backward(indices)
        ctx.num_points = features.size(-1)
        return _backend.gather_features_forward(features, indices)

    @staticmethod
    def backward(ctx, grad_output):
        indices, = ctx.saved_tensors
        grad_features = _backend.gather_features_backward(grad_output.contiguous(), indices, ctx.num_points)
        return grad_features, None


gather = Gather.apply


def furthest_point_sample(coords, num_samples):
    """
    Uses iterative furthest point sampling to select a set of npoint features that have the largest
    minimum distance to the sampled point set
    :param coords: coordinates of points, FloatTensor[B, 3, N]
    :param num_samples: int, M
    :return:
        centers_coords: coordinates of sampled centers, FloatTensor[B, 3, M]
    """
    coords = coords.contiguous()
    indices = _backend.furthest_point_sampling(coords, num_samples)
    return gather(coords, indices)


def logits_mask(coords, logits, num_points_per_object):
    """
    Use logits to sample points
    :param coords: coords of points, FloatTensor[B, 3, N]
    :param logits: binary classification logits, FloatTensor[B, 2, N]
    :param num_points_per_object: M, #points per object after masking, int
    :return:
        selected_coords: FloatTensor[B, 3, M]
        masked_coords_mean: mean coords of selected points, FloatTensor[B, 3]
        mask: mask to select points, BoolTensor[B, N]
    """
    batch_size, _, num_points = coords.shape
    mask = torch.lt(logits[:, 0, :], logits[:, 1, :])  # [B, N]
    num_candidates = torch.sum(mask, dim=-1, keepdim=True)  # [B, 1]
    masked_coords = coords * mask.view(batch_size, 1, num_points)  # [B, C, N]
    masked_coords_mean = torch.sum(masked_coords, dim=-1) / torch.max(num_candidates,
                                                                      torch.ones_like(num_candidates)).float()  # [B, C]
    selected_indices = torch.zeros((batch_size, num_points_per_object), device=coords.device, dtype=torch.int32)
    for i in range(batch_size):
        current_mask = mask[i]  # [N]
        current_candidates = current_mask.nonzero().view(-1)
        current_num_candidates = current_candidates.numel()
        if current_num_candidates >= num_points_per_object:
            choices = np.random.choice(current_num_candidates, num_points_per_object, replace=False)
            selected_indices[i] = current_candidates[choices]
        elif current_num_candidates > 0:
            choices = np.concatenate([
                np.arange(current_num_candidates).repeat(num_points_per_object // current_num_candidates),
                np.random.choice(current_num_candidates, num_points_per_object % current_num_candidates, replace=False)
            ])
            np.random.shuffle(choices)
            selected_indices[i] = current_candidates[choices]
    selected_coords = gather(masked_coords - masked_coords_mean.view(batch_size, -1, 1), selected_indices)
    return selected_coords, masked_coords_mean, mask
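A usage sketch for `furthest_point_sample` (assumed sizes; requires the compiled CUDA backend and a GPU): it subsamples a dense cloud down to a set of well-spread centers, which the PVCNN set-abstraction layers use as query points.

```python
import torch

coords = torch.rand(2, 3, 4096).cuda()         # (B, 3, N)
centers = furthest_point_sample(coords, 1024)  # (B, 3, 1024)
print(centers.shape)
```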
model/pvcnn/modules/functional/src/ball_query/ball_query.cpp
ADDED
@@ -0,0 +1,30 @@
#include "ball_query.hpp"
#include "ball_query.cuh"

#include "../utils.hpp"

at::Tensor ball_query_forward(at::Tensor centers_coords,
                              at::Tensor points_coords, const float radius,
                              const int num_neighbors) {
  CHECK_CUDA(centers_coords);
  CHECK_CUDA(points_coords);
  CHECK_CONTIGUOUS(centers_coords);
  CHECK_CONTIGUOUS(points_coords);
  CHECK_IS_FLOAT(centers_coords);
  CHECK_IS_FLOAT(points_coords);

  int b = centers_coords.size(0);
  int m = centers_coords.size(2);
  int n = points_coords.size(2);

  at::Tensor neighbors_indices = torch::zeros(
      {b, m, num_neighbors},
      at::device(centers_coords.device()).dtype(at::ScalarType::Int));

  ball_query(b, n, m, radius * radius, num_neighbors,
             centers_coords.data_ptr<float>(),
             points_coords.data_ptr<float>(),
             neighbors_indices.data_ptr<int>());

  return neighbors_indices;
}
model/pvcnn/modules/functional/src/ball_query/ball_query.cu
ADDED
@@ -0,0 +1,59 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "../cuda_utils.cuh"

/*
  Function: ball query
  Args:
    b   : batch size
    n   : number of points in point clouds
    m   : number of query centers
    r2  : ball query radius ** 2
    u   : maximum number of neighbors
    centers_coords: coordinates of centers, FloatTensor[b, 3, m]
    points_coords : coordinates of points, FloatTensor[b, 3, n]
    neighbors_indices : neighbor indices in points, IntTensor[b, m, u]
*/
__global__ void ball_query_kernel(int b, int n, int m, float r2, int u,
                                  const float *__restrict__ centers_coords,
                                  const float *__restrict__ points_coords,
                                  int *__restrict__ neighbors_indices) {
  int batch_index = blockIdx.x;
  int index = threadIdx.x;
  int stride = blockDim.x;
  points_coords += batch_index * n * 3;
  centers_coords += batch_index * m * 3;
  neighbors_indices += batch_index * m * u;

  for (int j = index; j < m; j += stride) {
    float center_x = centers_coords[j];
    float center_y = centers_coords[j + m];
    float center_z = centers_coords[j + m + m];
    for (int k = 0, cnt = 0; k < n && cnt < u; ++k) {
      float dx = center_x - points_coords[k];
      float dy = center_y - points_coords[k + n];
      float dz = center_z - points_coords[k + n + n];
      float d2 = dx * dx + dy * dy + dz * dz;
      if (d2 < r2) {
        if (cnt == 0) {
          for (int v = 0; v < u; ++v) {
            neighbors_indices[j * u + v] = k;
          }
        }
        neighbors_indices[j * u + cnt] = k;
        ++cnt;
      }
    }
  }
}

void ball_query(int b, int n, int m, float r2, int u,
                const float *centers_coords, const float *points_coords,
                int *neighbors_indices) {
  ball_query_kernel<<<b, optimal_num_threads(m), 0,
                      at::cuda::getCurrentCUDAStream()>>>(
      b, n, m, r2, u, centers_coords, points_coords, neighbors_indices);
  CUDA_CHECK_ERRORS();
}
model/pvcnn/modules/functional/src/ball_query/ball_query.cuh
ADDED
@@ -0,0 +1,8 @@
#ifndef _BALL_QUERY_CUH
#define _BALL_QUERY_CUH

void ball_query(int b, int n, int m, float r2, int u,
                const float *centers_coords, const float *points_coords,
                int *neighbors_indices);

#endif
model/pvcnn/modules/functional/src/ball_query/ball_query.hpp
ADDED
@@ -0,0 +1,10 @@
#ifndef _BALL_QUERY_HPP
#define _BALL_QUERY_HPP

#include <torch/extension.h>

at::Tensor ball_query_forward(at::Tensor centers_coords,
                              at::Tensor points_coords, const float radius,
                              const int num_neighbors);

#endif
model/pvcnn/modules/functional/src/bindings.cpp
ADDED
@@ -0,0 +1,37 @@
#include <pybind11/pybind11.h>

#include "ball_query/ball_query.hpp"
#include "grouping/grouping.hpp"
#include "interpolate/neighbor_interpolate.hpp"
#include "interpolate/trilinear_devox.hpp"
#include "sampling/sampling.hpp"
#include "voxelization/vox.hpp"

PYBIND11_MODULE(_pvcnn_backend, m) {
  m.def("gather_features_forward", &gather_features_forward,
        "Gather Centers' Features forward (CUDA)");
  m.def("gather_features_backward", &gather_features_backward,
        "Gather Centers' Features backward (CUDA)");
  m.def("furthest_point_sampling", &furthest_point_sampling_forward,
        "Furthest Point Sampling (CUDA)");
  m.def("ball_query", &ball_query_forward, "Ball Query (CUDA)");
  m.def("grouping_forward", &grouping_forward,
        "Grouping Features forward (CUDA)");
  m.def("grouping_backward", &grouping_backward,
        "Grouping Features backward (CUDA)");
  m.def("three_nearest_neighbors_interpolate_forward",
        &three_nearest_neighbors_interpolate_forward,
        "3 Nearest Neighbors Interpolate forward (CUDA)");
  m.def("three_nearest_neighbors_interpolate_backward",
        &three_nearest_neighbors_interpolate_backward,
        "3 Nearest Neighbors Interpolate backward (CUDA)");

  m.def("trilinear_devoxelize_forward", &trilinear_devoxelize_forward,
        "Trilinear Devoxelization forward (CUDA)");
  m.def("trilinear_devoxelize_backward", &trilinear_devoxelize_backward,
        "Trilinear Devoxelization backward (CUDA)");
  m.def("avg_voxelize_forward", &avg_voxelize_forward,
        "Voxelization forward with average pooling (CUDA)");
  m.def("avg_voxelize_backward", &avg_voxelize_backward,
        "Voxelization backward (CUDA)");
}
model/pvcnn/modules/functional/src/cuda_utils.cuh
ADDED
@@ -0,0 +1,39 @@
#ifndef _CUDA_UTILS_H
#define _CUDA_UTILS_H

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cmath>

#include <cuda.h>
#include <cuda_runtime.h>

#include <vector>

#define MAXIMUM_THREADS 512

inline int optimal_num_threads(int work_size) {
  const int pow_2 = std::log2(static_cast<double>(work_size));
  return max(min(1 << pow_2, MAXIMUM_THREADS), 1);
}

inline dim3 optimal_block_config(int x, int y) {
  const int x_threads = optimal_num_threads(x);
  const int y_threads =
      max(min(optimal_num_threads(y), MAXIMUM_THREADS / x_threads), 1);
  dim3 block_config(x_threads, y_threads, 1);
  return block_config;
}

#define CUDA_CHECK_ERRORS()                                           \
  {                                                                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
      fprintf(stderr, "CUDA kernel failed : %s\n%s at L:%d in %s\n",  \
              cudaGetErrorString(err), __PRETTY_FUNCTION__, __LINE__, \
              __FILE__);                                              \
      exit(-1);                                                       \
    }                                                                 \
  }

#endif
model/pvcnn/modules/functional/src/grouping/grouping.cpp
ADDED
@@ -0,0 +1,44 @@
#include "grouping.hpp"
#include "grouping.cuh"

#include "../utils.hpp"

at::Tensor grouping_forward(at::Tensor features, at::Tensor indices) {
  CHECK_CUDA(features);
  CHECK_CUDA(indices);
  CHECK_CONTIGUOUS(features);
  CHECK_CONTIGUOUS(indices);
  CHECK_IS_FLOAT(features);
  CHECK_IS_INT(indices);

  int b = features.size(0);
  int c = features.size(1);
  int n = features.size(2);
  int m = indices.size(1);
  int u = indices.size(2);
  at::Tensor output = torch::zeros(
      {b, c, m, u}, at::device(features.device()).dtype(at::ScalarType::Float));
  grouping(b, c, n, m, u, features.data_ptr<float>(), indices.data_ptr<int>(),
           output.data_ptr<float>());
  return output;
}

at::Tensor grouping_backward(at::Tensor grad_y, at::Tensor indices,
                             const int n) {
  CHECK_CUDA(grad_y);
  CHECK_CUDA(indices);
  CHECK_CONTIGUOUS(grad_y);
  CHECK_CONTIGUOUS(indices);
  CHECK_IS_FLOAT(grad_y);
  CHECK_IS_INT(indices);

  int b = grad_y.size(0);
  int c = grad_y.size(1);
  int m = indices.size(1);
  int u = indices.size(2);
  at::Tensor grad_x = torch::zeros(
      {b, c, n}, at::device(grad_y.device()).dtype(at::ScalarType::Float));
  grouping_grad(b, c, n, m, u, grad_y.data_ptr<float>(),
                indices.data_ptr<int>(), grad_x.data_ptr<float>());
  return grad_x;
}
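`grouping_forward` gathers per-point features into `[b, c, m, u]` neighborhoods, and `grouping_backward` scatters gradients back to the `[b, c, n]` point features. A hedged shape-check sketch, assuming these extension sources are compiled into a standalone libtorch program with CUDA available (the include path and build setup here are hypothetical, not part of the diff):

```cpp
#include <iostream>
#include <torch/torch.h>
#include "grouping.hpp"  // assumed to sit next to grouping.cpp

int main() {
  // b=2 batches, c=16 channels, n=1024 points, m=256 centers, u=32 neighbors.
  auto features = torch::rand({2, 16, 1024}, torch::device(torch::kCUDA));
  auto indices = torch::randint(
      0, 1024, {2, 256, 32}, torch::device(torch::kCUDA).dtype(torch::kInt));

  auto grouped = grouping_forward(features, indices);      // [2, 16, 256, 32]
  auto grad_x = grouping_backward(grouped, indices, 1024); // [2, 16, 1024]
  std::cout << grouped.sizes() << " " << grad_x.sizes() << std::endl;
  return 0;
}
```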
model/pvcnn/modules/functional/src/grouping/grouping.cu
ADDED
@@ -0,0 +1,85 @@
#include <stdio.h>
#include <stdlib.h>

#include "../cuda_utils.cuh"

/*
  Function: grouping features of neighbors (forward)
  Args:
    b       : batch size
    c       : #channels of features
    n       : number of points in point clouds
    m       : number of query centers
    u       : maximum number of neighbors
    features: points' features, FloatTensor[b, c, n]
    indices : neighbor indices in points, IntTensor[b, m, u]
    out     : gathered features, FloatTensor[b, c, m, u]
*/
__global__ void grouping_kernel(int b, int c, int n, int m, int u,
                                const float *__restrict__ features,
                                const int *__restrict__ indices,
                                float *__restrict__ out) {
  int batch_index = blockIdx.x;
  features += batch_index * n * c;
  indices += batch_index * m * u;
  out += batch_index * m * u * c;

  const int index = threadIdx.y * blockDim.x + threadIdx.x;
  const int stride = blockDim.y * blockDim.x;
  for (int i = index; i < c * m; i += stride) {
    const int l = i / m;
    const int j = i % m;
    for (int k = 0; k < u; ++k) {
      out[(l * m + j) * u + k] = features[l * n + indices[j * u + k]];
    }
  }
}

void grouping(int b, int c, int n, int m, int u, const float *features,
              const int *indices, float *out) {
  grouping_kernel<<<b, optimal_block_config(m, c), 0,
                    at::cuda::getCurrentCUDAStream()>>>(b, c, n, m, u, features,
                                                        indices, out);
  CUDA_CHECK_ERRORS();
}

/*
  Function: grouping features of neighbors (backward)
  Args:
    b       : batch size
    c       : #channels of features
    n       : number of points in point clouds
    m       : number of query centers
    u       : maximum number of neighbors
    grad_y  : grad of gathered features, FloatTensor[b, c, m, u]
    indices : neighbor indices in points, IntTensor[b, m, u]
    grad_x  : grad of points' features, FloatTensor[b, c, n]
*/
__global__ void grouping_grad_kernel(int b, int c, int n, int m, int u,
                                     const float *__restrict__ grad_y,
                                     const int *__restrict__ indices,
                                     float *__restrict__ grad_x) {
  int batch_index = blockIdx.x;
  grad_y += batch_index * m * u * c;
  indices += batch_index * m * u;
  grad_x += batch_index * n * c;

  const int index = threadIdx.y * blockDim.x + threadIdx.x;
  const int stride = blockDim.y * blockDim.x;
  for (int i = index; i < c * m; i += stride) {
    const int l = i / m;
    const int j = i % m;
    for (int k = 0; k < u; ++k) {
      atomicAdd(grad_x + l * n + indices[j * u + k],
                grad_y[(l * m + j) * u + k]);
    }
  }
}

void grouping_grad(int b, int c, int n, int m, int u, const float *grad_y,
                   const int *indices, float *grad_x) {
  grouping_grad_kernel<<<b, optimal_block_config(m, c), 0,
                         at::cuda::getCurrentCUDAStream()>>>(
      b, c, n, m, u, grad_y, indices, grad_x);
  CUDA_CHECK_ERRORS();
}
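The forward kernel is a pure gather: each output element copies one input feature selected by `indices`. A CPU reference with the same flat indexing, useful for unit-testing the kernel (illustration only, not part of the extension):

```cpp
#include <vector>

// out[b][c][m][u] = features[b][c][ indices[b][m][u] ], flattened row-major.
std::vector<float> grouping_cpu(int b, int c, int n, int m, int u,
                                const std::vector<float> &features,  // [b, c, n]
                                const std::vector<int> &indices) {   // [b, m, u]
  std::vector<float> out(static_cast<size_t>(b) * c * m * u, 0.f);
  for (int bi = 0; bi < b; ++bi)
    for (int l = 0; l < c; ++l)
      for (int j = 0; j < m; ++j)
        for (int k = 0; k < u; ++k)
          out[((bi * c + l) * m + j) * u + k] =
              features[(bi * c + l) * n + indices[(bi * m + j) * u + k]];
  return out;
}
```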
model/pvcnn/modules/functional/src/grouping/grouping.cuh
ADDED
@@ -0,0 +1,9 @@
#ifndef _GROUPING_CUH
#define _GROUPING_CUH

void grouping(int b, int c, int n, int m, int u, const float *features,
              const int *indices, float *out);
void grouping_grad(int b, int c, int n, int m, int u, const float *grad_y,
                   const int *indices, float *grad_x);

#endif
model/pvcnn/modules/functional/src/grouping/grouping.hpp
ADDED
@@ -0,0 +1,10 @@
#ifndef _GROUPING_HPP
#define _GROUPING_HPP

#include <torch/extension.h>

at::Tensor grouping_forward(at::Tensor features, at::Tensor indices);
at::Tensor grouping_backward(at::Tensor grad_y, at::Tensor indices,
                             const int n);

#endif
model/pvcnn/modules/functional/src/interpolate/neighbor_interpolate.cpp
ADDED
@@ -0,0 +1,65 @@
#include "neighbor_interpolate.hpp"
#include "neighbor_interpolate.cuh"

#include "../utils.hpp"

std::vector<at::Tensor>
three_nearest_neighbors_interpolate_forward(at::Tensor points_coords,
                                            at::Tensor centers_coords,
                                            at::Tensor centers_features) {
  CHECK_CUDA(points_coords);
  CHECK_CUDA(centers_coords);
  CHECK_CUDA(centers_features);
  CHECK_CONTIGUOUS(points_coords);
  CHECK_CONTIGUOUS(centers_coords);
  CHECK_CONTIGUOUS(centers_features);
  CHECK_IS_FLOAT(points_coords);
  CHECK_IS_FLOAT(centers_coords);
  CHECK_IS_FLOAT(centers_features);

  int b = centers_features.size(0);
  int c = centers_features.size(1);
  int m = centers_features.size(2);
  int n = points_coords.size(2);

  at::Tensor indices = torch::zeros(
      {b, 3, n}, at::device(points_coords.device()).dtype(at::ScalarType::Int));
  at::Tensor weights = torch::zeros(
      {b, 3, n},
      at::device(points_coords.device()).dtype(at::ScalarType::Float));
  at::Tensor output = torch::zeros(
      {b, c, n},
      at::device(centers_features.device()).dtype(at::ScalarType::Float));

  three_nearest_neighbors_interpolate(
      b, c, m, n, points_coords.data_ptr<float>(),
      centers_coords.data_ptr<float>(), centers_features.data_ptr<float>(),
      indices.data_ptr<int>(), weights.data_ptr<float>(),
      output.data_ptr<float>());
  return {output, indices, weights};
}

at::Tensor three_nearest_neighbors_interpolate_backward(at::Tensor grad_y,
                                                        at::Tensor indices,
                                                        at::Tensor weights,
                                                        const int m) {
  CHECK_CUDA(grad_y);
  CHECK_CUDA(indices);
  CHECK_CUDA(weights);
  CHECK_CONTIGUOUS(grad_y);
  CHECK_CONTIGUOUS(indices);
  CHECK_CONTIGUOUS(weights);
  CHECK_IS_FLOAT(grad_y);
  CHECK_IS_INT(indices);
  CHECK_IS_FLOAT(weights);

  int b = grad_y.size(0);
  int c = grad_y.size(1);
  int n = grad_y.size(2);
  at::Tensor grad_x = torch::zeros(
      {b, c, m}, at::device(grad_y.device()).dtype(at::ScalarType::Float));
  three_nearest_neighbors_interpolate_grad(
      b, c, n, m, grad_y.data_ptr<float>(), indices.data_ptr<int>(),
      weights.data_ptr<float>(), grad_x.data_ptr<float>());
  return grad_x;
}
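The forward op returns three tensors: interpolated point features `[b, c, n]`, nearest-center indices `[b, 3, n]`, and interpolation weights `[b, 3, n]`; the backward op scatters gradients back onto the centers. A hedged shape-check sketch under the same assumptions as the grouping example above (standalone libtorch build with CUDA; not part of the diff):

```cpp
#include <torch/torch.h>
#include "neighbor_interpolate.hpp"  // assumed to sit next to neighbor_interpolate.cpp

int main() {
  // b=1, c=8 channels, m=128 centers, n=1024 points to interpolate onto.
  auto points = torch::rand({1, 3, 1024}, torch::device(torch::kCUDA));
  auto centers = torch::rand({1, 3, 128}, torch::device(torch::kCUDA));
  auto feats = torch::rand({1, 8, 128}, torch::device(torch::kCUDA));

  auto outs = three_nearest_neighbors_interpolate_forward(points, centers, feats);
  // outs[0]: [1, 8, 1024] features, outs[1]: [1, 3, 1024] indices,
  // outs[2]: [1, 3, 1024] weights
  auto grad_x = three_nearest_neighbors_interpolate_backward(
      outs[0], outs[1], outs[2], /*m=*/128);  // [1, 8, 128]
  return 0;
}
```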
model/pvcnn/modules/functional/src/interpolate/neighbor_interpolate.cu
ADDED
@@ -0,0 +1,181 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "../cuda_utils.cuh"

/*
  Function: three nearest neighbors
  Args:
    b : batch size
    n : number of points in point clouds
    m : number of query centers
    points_coords : coordinates of points, FloatTensor[b, 3, n]
    centers_coords: coordinates of centers, FloatTensor[b, 3, m]
    weights       : weights of nearest 3 centers to the point,
                    FloatTensor[b, 3, n]
    indices       : indices of nearest 3 centers to the point,
                    IntTensor[b, 3, n]
*/
__global__ void three_nearest_neighbors_kernel(
    int b, int n, int m, const float *__restrict__ points_coords,
    const float *__restrict__ centers_coords, float *__restrict__ weights,
    int *__restrict__ indices) {
  int batch_index = blockIdx.x;
  int index = threadIdx.x;
  int stride = blockDim.x;
  points_coords += batch_index * 3 * n;
  weights += batch_index * 3 * n;
  indices += batch_index * 3 * n;
  centers_coords += batch_index * 3 * m;

  for (int j = index; j < n; j += stride) {
    float ux = points_coords[j];
    float uy = points_coords[j + n];
    float uz = points_coords[j + n + n];

    double best0 = 1e40, best1 = 1e40, best2 = 1e40;
    int besti0 = 0, besti1 = 0, besti2 = 0;
    for (int k = 0; k < m; ++k) {
      float x = centers_coords[k];
      float y = centers_coords[k + m];
      float z = centers_coords[k + m + m];
      float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
      if (d < best2) {
        best2 = d;
        besti2 = k;
        if (d < best1) {
          best2 = best1;
          besti2 = besti1;
          best1 = d;
          besti1 = k;
          if (d < best0) {
            best1 = best0;
            besti1 = besti0;
            best0 = d;
            besti0 = k;
          }
        }
      }
    }
    best0 = max(min(1e10f, best0), 1e-10f);
    best1 = max(min(1e10f, best1), 1e-10f);
    best2 = max(min(1e10f, best2), 1e-10f);
    float d0d1 = best0 * best1;
    float d0d2 = best0 * best2;
    float d1d2 = best1 * best2;
    float d0d1d2 = 1.0f / (d0d1 + d0d2 + d1d2);
    weights[j] = d1d2 * d0d1d2;
    indices[j] = besti0;
    weights[j + n] = d0d2 * d0d1d2;
    indices[j + n] = besti1;
    weights[j + n + n] = d0d1 * d0d1d2;
    indices[j + n + n] = besti2;
  }
}

/*
  Function: interpolate three nearest neighbors (forward)
  Args:
    b : batch size
    c : #channels of features
    m : number of query centers
    n : number of points in point clouds
    centers_features: features of centers, FloatTensor[b, c, m]
    indices         : indices of nearest 3 centers to the point,
                      IntTensor[b, 3, n]
    weights         : weights for interpolation, FloatTensor[b, 3, n]
    out             : features of points, FloatTensor[b, c, n]
*/
__global__ void three_nearest_neighbors_interpolate_kernel(
    int b, int c, int m, int n, const float *__restrict__ centers_features,
    const int *__restrict__ indices, const float *__restrict__ weights,
    float *__restrict__ out) {
  int batch_index = blockIdx.x;
  centers_features += batch_index * m * c;
  indices += batch_index * n * 3;
  weights += batch_index * n * 3;
  out += batch_index * n * c;

  const int index = threadIdx.y * blockDim.x + threadIdx.x;
  const int stride = blockDim.y * blockDim.x;
  for (int i = index; i < c * n; i += stride) {
    const int l = i / n;
    const int j = i % n;
    float w1 = weights[j];
    float w2 = weights[j + n];
    float w3 = weights[j + n + n];
    int i1 = indices[j];
    int i2 = indices[j + n];
    int i3 = indices[j + n + n];

    out[i] = centers_features[l * m + i1] * w1 +
             centers_features[l * m + i2] * w2 +
             centers_features[l * m + i3] * w3;
  }
}

void three_nearest_neighbors_interpolate(int b, int c, int m, int n,
                                         const float *points_coords,
                                         const float *centers_coords,
                                         const float *centers_features,
                                         int *indices, float *weights,
                                         float *out) {
  three_nearest_neighbors_kernel<<<b, optimal_num_threads(n), 0,
                                   at::cuda::getCurrentCUDAStream()>>>(
      b, n, m, points_coords, centers_coords, weights, indices);
  three_nearest_neighbors_interpolate_kernel<<<
      b, optimal_block_config(n, c), 0, at::cuda::getCurrentCUDAStream()>>>(
      b, c, m, n, centers_features, indices, weights, out);
  CUDA_CHECK_ERRORS();
}

/*
  Function: interpolate three nearest neighbors (backward)
  Args:
    b : batch size
    c : #channels of features
    m : number of query centers
    n : number of points in point clouds
    grad_y  : grad of features of points, FloatTensor[b, c, n]
    indices : indices of nearest 3 centers to the point, IntTensor[b, 3, n]
    weights : weights for interpolation, FloatTensor[b, 3, n]
    grad_x  : grad of features of centers, FloatTensor[b, c, m]
*/
__global__ void three_nearest_neighbors_interpolate_grad_kernel(
    int b, int c, int n, int m, const float *__restrict__ grad_y,
    const int *__restrict__ indices, const float *__restrict__ weights,
    float *__restrict__ grad_x) {
  int batch_index = blockIdx.x;
  grad_y += batch_index * n * c;
  indices += batch_index * n * 3;
  weights += batch_index * n * 3;
  grad_x += batch_index * m * c;

  const int index = threadIdx.y * blockDim.x + threadIdx.x;
  const int stride = blockDim.y * blockDim.x;
  for (int i = index; i < c * n; i += stride) {
    const int l = i / n;
    const int j = i % n;
    float w1 = weights[j];
    float w2 = weights[j + n];
    float w3 = weights[j + n + n];
    int i1 = indices[j];
    int i2 = indices[j + n];
    int i3 = indices[j + n + n];
    atomicAdd(grad_x + l * m + i1, grad_y[i] * w1);
    atomicAdd(grad_x + l * m + i2, grad_y[i] * w2);
    atomicAdd(grad_x + l * m + i3, grad_y[i] * w3);
  }
}

void three_nearest_neighbors_interpolate_grad(int b, int c, int n, int m,
                                              const float *grad_y,
                                              const int *indices,
                                              const float *weights,
                                              float *grad_x) {
  three_nearest_neighbors_interpolate_grad_kernel<<<
      b, optimal_block_config(n, c), 0, at::cuda::getCurrentCUDAStream()>>>(
      b, c, n, m, grad_y, indices, weights, grad_x);
  CUDA_CHECK_ERRORS();
}
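The weight computation in `three_nearest_neighbors_kernel` is inverse-(squared-)distance weighting over the three nearest centers: with clamped squared distances `d0 <= d1 <= d2`, each weight is proportional to `1/d_i` and the three weights sum to one. A CPU reference of just that step (illustration only, using floats where the kernel accumulates in double):

```cpp
#include <algorithm>
#include <array>

// w0 = d1*d2 / (d0*d1 + d0*d2 + d1*d2), and cyclically for w1, w2,
// i.e. w_i ∝ 1/d_i with w0 + w1 + w2 = 1.
std::array<float, 3> three_nn_weights(float d0, float d1, float d2) {
  d0 = std::max(std::min(1e10f, d0), 1e-10f);
  d1 = std::max(std::min(1e10f, d1), 1e-10f);
  d2 = std::max(std::min(1e10f, d2), 1e-10f);
  const float norm = 1.0f / (d0 * d1 + d0 * d2 + d1 * d2);
  return {d1 * d2 * norm, d0 * d2 * norm, d0 * d1 * norm};
}
```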
model/pvcnn/modules/functional/src/interpolate/neighbor_interpolate.cuh
ADDED
@@ -0,0 +1,16 @@
#ifndef _NEIGHBOR_INTERPOLATE_CUH
#define _NEIGHBOR_INTERPOLATE_CUH

void three_nearest_neighbors_interpolate(int b, int c, int m, int n,
                                         const float *points_coords,
                                         const float *centers_coords,
                                         const float *centers_features,
                                         int *indices, float *weights,
                                         float *out);
void three_nearest_neighbors_interpolate_grad(int b, int c, int n, int m,
                                              const float *grad_y,
                                              const int *indices,
                                              const float *weights,
                                              float *grad_x);

#endif
model/pvcnn/modules/functional/src/interpolate/neighbor_interpolate.hpp
ADDED
@@ -0,0 +1,16 @@
#ifndef _NEIGHBOR_INTERPOLATE_HPP
#define _NEIGHBOR_INTERPOLATE_HPP

#include <torch/extension.h>
#include <vector>

std::vector<at::Tensor>
three_nearest_neighbors_interpolate_forward(at::Tensor points_coords,
                                            at::Tensor centers_coords,
                                            at::Tensor centers_features);
at::Tensor three_nearest_neighbors_interpolate_backward(at::Tensor grad_y,
                                                        at::Tensor indices,
                                                        at::Tensor weights,
                                                        const int m);

#endif