colin1842 committed on
Commit
8d5039c
1 Parent(s): 232f3bd
This view is limited to 50 files because it contains too many changes. See the raw diff for the full changeset.
Files changed (50)
  1. LICENSE +21 -0
  2. _script.py +145 -0
  3. dataset/__pycache__/data_utils.cpython-38.pyc +0 -0
  4. dataset/__pycache__/roofn3d_dataset.cpython-38.pyc +0 -0
  5. dataset/data_utils.py +48 -0
  6. dataset/roofn3d_dataset.py +235 -0
  7. format_dataset.py +53 -0
  8. hoho_train_checkpoint_epoch_90.pth +3 -0
  9. model/__pycache__/cluster_refine.cpython-38.pyc +0 -0
  10. model/__pycache__/edge_pred_net.cpython-38.pyc +0 -0
  11. model/__pycache__/model_utils.cpython-38.pyc +0 -0
  12. model/__pycache__/pointnet2.cpython-38.pyc +0 -0
  13. model/__pycache__/pointnet_stack_utils.cpython-38.pyc +0 -0
  14. model/__pycache__/pointnet_util.cpython-38.pyc +0 -0
  15. model/__pycache__/roofnet.cpython-38.pyc +0 -0
  16. model/cluster_refine.py +305 -0
  17. model/edge_pred_net.py +173 -0
  18. model/model_utils.py +156 -0
  19. model/pointnet2.py +305 -0
  20. model/pointnet_stack_utils.py +265 -0
  21. model/pointnet_util.py +518 -0
  22. model/roofnet.py +35 -0
  23. model_cfg.yaml +26 -0
  24. output/hoho_test/checkpoint_epoch_90_all.pth +3 -0
  25. output/hoho_test/test/log.txt +0 -0
  26. output/hoho_test/test/submission.parquet +3 -0
  27. output/hoho_train/ckpt/checkpoint_epoch_41.pth +3 -0
  28. output/hoho_train/ckpt/checkpoint_epoch_42.pth +3 -0
  29. output/hoho_train/ckpt/checkpoint_epoch_43.pth +3 -0
  30. output/hoho_train/ckpt/checkpoint_epoch_44.pth +3 -0
  31. output/hoho_train/ckpt/checkpoint_epoch_45.pth +3 -0
  32. output/hoho_train/log.txt +9 -0
  33. pc_util/setup.py +23 -0
  34. pc_util/src/ball_query.cpp +84 -0
  35. pc_util/src/ball_query_gpu.cu +270 -0
  36. pc_util/src/ball_query_gpu.h +38 -0
  37. pc_util/src/cluster.cpp +50 -0
  38. pc_util/src/cluster_gpu.cu +192 -0
  39. pc_util/src/cluster_gpu.h +34 -0
  40. pc_util/src/cuda_utils.h +15 -0
  41. pc_util/src/group_points.cpp +98 -0
  42. pc_util/src/group_points_gpu.cu +199 -0
  43. pc_util/src/group_points_gpu.h +36 -0
  44. pc_util/src/interpolate.cpp +148 -0
  45. pc_util/src/interpolate_gpu.cu +343 -0
  46. pc_util/src/interpolate_gpu.h +61 -0
  47. pc_util/src/pointnet2_api.cpp +41 -0
  48. pc_util/src/sampling.cpp +46 -0
  49. pc_util/src/sampling_gpu.cu +259 -0
  50. pc_util/src/sampling_gpu.h +29 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Weihang Li
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
_script.py ADDED
@@ -0,0 +1,145 @@
+ ### This is an example of the script that will be run in the test environment.
+ ### Some parts of the code are compulsory and you should NOT CHANGE THEM.
+ ### They are between '''---compulsory---''' comments.
+ ### You can change the rest of the code to define and test your solution.
+ ### However, you should not change the signature of the provided function.
+ ### The script saves a "submission.parquet" file in the current directory.
+ ### The actual logic of the solution is implemented in the `handcrafted_solution.py` file.
+ ### The `handcrafted_solution.py` file is a placeholder for your solution.
+ ### You should implement the logic of your solution in that file.
+ ### You can use any additional files and subdirectories to organize your code.
+
+ '''---compulsory---'''
+ # import subprocess
+ # from pathlib import Path
+ # def install_package_from_local_file(package_name, folder='packages'):
+ #     """
+ #     Installs a package from a local .whl file or a directory containing .whl files using pip.
+
+ #     Parameters:
+ #     path_to_file_or_directory (str): The path to the .whl file or the directory containing .whl files.
+ #     """
+ #     try:
+ #         pth = str(Path(folder) / package_name)
+ #         subprocess.check_call([subprocess.sys.executable, "-m", "pip", "install",
+ #                                "--no-index",  # Do not use package index
+ #                                "--find-links", pth,  # Look for packages in the specified directory or at the file
+ #                                package_name])  # Specify the package to install
+ #         print(f"Package installed successfully from {pth}")
+ #     except subprocess.CalledProcessError as e:
+ #         print(f"Failed to install package from {pth}. Error: {e}")
+
+ # install_package_from_local_file('hoho')
+
+ import hoho; hoho.setup()  # YOU MUST CALL hoho.setup() BEFORE ANYTHING ELSE
+ # import subprocess
+ # import importlib
+ # from pathlib import Path
+ # import subprocess
+
+
+ # ### The function below is useful for installing additional python wheels.
+ # def install_package_from_local_file(package_name, folder='packages'):
+ #     """
+ #     Installs a package from a local .whl file or a directory containing .whl files using pip.
+
+ #     Parameters:
+ #     path_to_file_or_directory (str): The path to the .whl file or the directory containing .whl files.
+ #     """
+ #     try:
+ #         pth = str(Path(folder) / package_name)
+ #         subprocess.check_call([subprocess.sys.executable, "-m", "pip", "install",
+ #                                "--no-index",  # Do not use package index
+ #                                "--find-links", pth,  # Look for packages in the specified directory or at the file
+ #                                package_name])  # Specify the package to install
+ #         print(f"Package installed successfully from {pth}")
+ #     except subprocess.CalledProcessError as e:
+ #         print(f"Failed to install package from {pth}. Error: {e}")
+
+
+ # pip download webdataset -d packages/webdataset --platform manylinux1_x86_64 --python-version 38 --only-binary=:all:
+ # install_package_from_local_file('webdataset')
+ # install_package_from_local_file('tqdm')
+
+ ### Here you can import any library or module you want.
+ ### The code below is used to read and parse the input dataset.
+ ### Please, do not modify it.
+
+ import webdataset as wds
+ from tqdm import tqdm
+ from typing import Dict
+ import pandas as pd
+ from transformers import AutoTokenizer
+ import os
+ import time
+ import io
+ from PIL import Image as PImage
+ import numpy as np
+
+ from hoho.read_write_colmap import read_cameras_binary, read_images_binary, read_points3D_binary
+ from hoho import proc, Sample
+
+ def convert_entry_to_human_readable(entry):
+     out = {}
+     already_good = ['__key__', 'wf_vertices', 'wf_edges', 'edge_semantics', 'mesh_vertices', 'mesh_faces', 'face_semantics', 'K', 'R', 't']
+     for k, v in entry.items():
+         if k in already_good:
+             out[k] = v
+             continue
+         if k == 'points3d':
+             out[k] = read_points3D_binary(fid=io.BytesIO(v))
+         if k == 'cameras':
+             out[k] = read_cameras_binary(fid=io.BytesIO(v))
+         if k == 'images':
+             out[k] = read_images_binary(fid=io.BytesIO(v))
+         if k in ['ade20k', 'gestalt']:
+             out[k] = [PImage.open(io.BytesIO(x)).convert('RGB') for x in v]
+         if k == 'depthcm':
+             out[k] = [PImage.open(io.BytesIO(x)) for x in entry['depthcm']]
+     return out
+
+ '''---end of compulsory---'''
+
+ ### The part below is used to define and test your solution.
+
+ from pathlib import Path
+ def save_submission(submission, path):
+     """
+     Saves the submission to a specified path.
+
+     Parameters:
+     submission (List[Dict[]]): The submission to save.
+     path (str): The path to save the submission to.
+     """
+     sub = pd.DataFrame(submission, columns=["__key__", "wf_vertices", "wf_edges"])
+     sub.to_parquet(path)
+     print(f"Submission saved to {path}")
+
+ if __name__ == "__main__":
+     from handcrafted_solution import predict
+     print("------------ Loading dataset ------------")
+     params = hoho.get_params()
+     dataset = hoho.get_dataset(decode=None, split='all', dataset_type='webdataset')
+
+     print('------------ Now you can do your solution ---------------')
+     solution = []
+     from concurrent.futures import ProcessPoolExecutor
+     with ProcessPoolExecutor(max_workers=8) as pool:
+         results = []
+         for i, sample in enumerate(tqdm(dataset)):
+             results.append(pool.submit(predict, sample, visualize=False))
+
+         for i, result in enumerate(tqdm(results)):
+             key, pred_vertices, pred_edges = result.result()
+             solution.append({
+                 '__key__': key,
+                 'wf_vertices': pred_vertices.tolist(),
+                 'wf_edges': pred_edges
+             })
+             if i % 100 == 0:
+                 # incrementally save the results in case we run out of time
+                 print(f"Processed {i} samples")
+                 # save_submission(solution, Path(params['output_path']) / "submission.parquet")
+     print('------------ Saving results ---------------')
+     save_submission(solution, Path(params['output_path']) / "submission.parquet")
+     print("------------ Done ------------")
dataset/__pycache__/data_utils.cpython-38.pyc ADDED
Binary file (1.5 kB).
 
dataset/__pycache__/roofn3d_dataset.cpython-38.pyc ADDED
Binary file (6.38 kB).
 
dataset/data_utils.py ADDED
@@ -0,0 +1,48 @@
+ from torch.utils.data import DataLoader
+ # from .roofn3d_dataset import RoofN3dDataset
+ from dataset.roofn3d_dataset import RoofN3dDataset, HohoDataset
+ import numpy as np
+ import random
+
+ __all__ = {
+     'RoofN3dDataset': RoofN3dDataset
+ }
+
+ class GaussianTransform:
+     def __init__(self, sigma=(0.005, 0.015), clip=0.05, p=0.8):
+         self.sigma = sigma
+         self.clip = clip
+         self.p = p
+
+     def __call__(self, points):
+         if np.random.rand(1) < self.p:
+             sigma = np.random.rand(1) * (self.sigma[1] - self.sigma[0]) + self.sigma[0]
+             rows, cols = points.shape
+             jittered_point = np.clip(sigma * np.random.randn(rows, cols), -1 * self.clip, self.clip)
+             jittered_point += points
+             return jittered_point
+         else:
+             return points
+
+ def build_dataloader(key, xyz, batch_size, data_cfg, workers=1, logger=None):
+     transform = GaussianTransform(sigma=(0.005, 0.010), clip=10, p=0.0)  # p=0: no jitter at inference
+
+     dataset = HohoDataset(key, xyz, transform, data_cfg, logger)
+     dataloader = DataLoader(
+         dataset, batch_size=batch_size, pin_memory=True, num_workers=workers, collate_fn=dataset.collate_batch,
+         shuffle=False)
+     return dataloader
+
+ # def build_dataloader(path, batch_size, data_cfg, workers=16, logger=None, training=True):
+ #     path += '/train.txt' if training else '/test.txt'
+
+ #     if training:
+ #         transform = GaussianTransform(sigma=(0.005, 0.010), clip=10, p=0.8)
+ #     else:
+ #         transform = GaussianTransform(sigma=(0.005, 0.010), clip=10, p=0.0)
+
+ #     dataset = RoofN3dDataset(path, transform, data_cfg, logger)
+ #     dataloader = DataLoader(
+ #         dataset, batch_size=batch_size, pin_memory=True, num_workers=workers, collate_fn=dataset.collate_batch,
+ #         shuffle=training)
+ #     return dataloader
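A minimal sketch of calling build_dataloader at inference time. The config object only needs an NPOINT attribute here; using easydict for it is an assumption of this sketch, not something this commit ships.

import numpy as np
from easydict import EasyDict

data_cfg = EasyDict({'NPOINT': 1024})                 # assumed minimal config
xyz = np.random.rand(5000, 3).astype(np.float32)      # one roof point cloud
loader = build_dataloader('sample_key', xyz, batch_size=1, data_cfg=data_cfg)
batch = next(iter(loader))                            # dict with 'points' (1, 1024, 3), 'frame_id', 'minMaxPt', 'batch_size'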
dataset/roofn3d_dataset.py ADDED
@@ -0,0 +1,235 @@
+ import numpy as np
+ from torch.utils.data import Dataset
+ from collections import defaultdict
+ import os
+ import shutil
+
+ def read_pts(pts_file):
+     with open(pts_file, 'r') as f:
+         lines = f.readlines()
+     pts = np.array([f.strip().split(' ') for f in lines], dtype=np.float64)
+     return pts
+
+
+ # def load_obj(obj_file):
+ #     vs, edges = [], set()
+ #     with open(obj_file, 'r') as f:
+ #         lines = f.readlines()
+ #     for f in lines:
+ #         vals = f.strip().split(' ')
+ #         if vals[0] == 'v':
+ #             vs.append(vals[1:])
+ #         else:
+ #             obj_data = np.array(vals[1:], dtype=int).reshape(-1, 1) - 1
+ #             idx = np.arange(len(obj_data)) - 1
+ #             cur_edge = np.concatenate([obj_data, obj_data[idx]], -1)
+ #             [edges.add(tuple(sorted(e))) for e in cur_edge]
+ #     vs = np.array(vs, dtype=np.float64)
+ #     edges = np.array(list(edges))
+ #     return vs, edges
+
+ def load_obj(obj_file):
+     vs, edges = [], set()
+     with open(obj_file, 'r') as f:
+         lines = f.readlines()
+
+     for line in lines:
+         vals = line.strip().split(' ')
+         if vals[0] == 'v':
+             vs.append([float(coord) for coord in vals[1:]])
+         elif vals[0] == 'l':
+             vertex_indices = [int(idx) - 1 for idx in vals[1:]]  # Convert to zero-based index
+             for i in range(len(vertex_indices) - 1):
+                 edge = tuple(sorted((vertex_indices[i], vertex_indices[i + 1])))
+                 edges.add(edge)
+
+     vs = np.array(vs, dtype=np.float64)
+     edges = np.array(list(edges), dtype=np.int32)
+
+     return vs, edges
+
+ def writePoints(points, clsRoad):
+     with open(clsRoad, 'w+') as file1:
+         for i in range(len(points)):
+             point = points[i]
+             file1.write(str(point[0]))
+             file1.write(' ')
+             file1.write(str(point[1]))
+             file1.write(' ')
+             file1.write(str(point[2]))
+             file1.write(' ')
+             file1.write('\n')
+
+
+ class RoofN3dDataset(Dataset):
+     def __init__(self, data_path, transform, data_cfg, logger=None):
+         with open(data_path, 'r') as f:
+             self.file_list = f.readlines()
+         self.file_list = [f.strip() for f in self.file_list]
+
+         self.npoint = data_cfg.NPOINT
+         self.transform = transform
+
+         if logger is not None:
+             logger.info('Total samples: %d' % len(self))
+
+     def __len__(self):
+         return len(self.file_list)
+
+     def __getitem__(self, item):
+         file_path = self.file_list[item]
+         frame_id = file_path.split('/')[-1]
+         points = read_pts(file_path + '/points.xyz')
+         points = self.transform(points)
+
+         # resample to a fixed number of points
+         if len(points) > self.npoint:
+             idx = np.random.randint(0, len(points), self.npoint)
+         else:
+             idx = np.random.randint(0, len(points), self.npoint - len(points))
+             idx = np.append(np.arange(0, len(points)), idx)
+             np.random.shuffle(idx)
+
+         points = points[idx]
+
+         vectors, edges = load_obj(self.file_list[item] + '/polygon.obj')
+         min_pt, max_pt = np.min(points, axis=0), np.max(points, axis=0)
+
+         # normalize points and wireframe vertices into a common unit cube
+         maxXYZ = np.max(max_pt)
+         minXYZ = np.min(min_pt)
+         min_pt[:] = minXYZ
+         max_pt[:] = maxXYZ
+
+         points = (points - min_pt) / (max_pt - min_pt)
+         vectors = (vectors - min_pt) / (max_pt - min_pt)
+         points = points.astype(np.float32)
+         vectors = vectors.astype(np.float32)
+         min_pt = min_pt.astype(np.float32)
+         max_pt = max_pt.astype(np.float32)
+         pt = np.concatenate((np.expand_dims(min_pt, 0), np.expand_dims(max_pt, 0)), axis=0)
+         data_dict = {'points': points, 'vectors': vectors, 'edges': edges, 'frame_id': frame_id, 'minMaxPt': pt}
+         return data_dict
+
+     @staticmethod
+     def collate_batch(batch_list, _unused=False):
+         data_dict = defaultdict(list)
+         for cur_sample in batch_list:
+             for key, val in cur_sample.items():
+                 data_dict[key].append(val)
+         batch_size = len(batch_list)
+         ret = {}
+         for key, val in data_dict.items():
+             try:
+                 if key == 'points':
+                     ret[key] = np.concatenate(val, axis=0).reshape([batch_size, -1, val[0].shape[-1]])
+                 elif key in ['vectors', 'edges']:
+                     # pad variable-length items to the longest in the batch with -10
+                     max_vec = max([len(x) for x in val])
+                     batch_vecs = np.ones((batch_size, max_vec, val[0].shape[-1]), dtype=np.float32) * -1e1
+                     for k in range(batch_size):
+                         batch_vecs[k, :val[k].__len__(), :] = val[k]
+                     ret[key] = batch_vecs
+                 elif key in ['frame_id', 'minMaxPt']:
+                     ret[key] = val
+                 else:
+                     ret[key] = np.stack(val, axis=0)
+             except Exception:
+                 print('Error in collate_batch: key=%s' % key)
+                 raise TypeError
+
+         ret['batch_size'] = batch_size
+         return ret
+
+
+ class HohoDataset(Dataset):
+     def __init__(self, key, xyz, transform, data_cfg, logger=None):
+         self.npoint = data_cfg.NPOINT
+         self.frame_id = key
+         self.xyz = xyz
+         self.transform = transform
+
+         if logger is not None:
+             logger.info('Total samples: %d' % len(self))
+
+     def __len__(self):
+         return 1
+
+     def __getitem__(self, item):
+         frame_id = self.frame_id
+         points = self.xyz
+
+         # resample to a fixed number of points
+         if len(points) > self.npoint:
+             idx = np.random.randint(0, len(points), self.npoint)
+         else:
+             idx = np.random.randint(0, len(points), self.npoint - len(points))
+             idx = np.append(np.arange(0, len(points)), idx)
+             np.random.shuffle(idx)
+
+         points = points[idx]
+
+         min_pt, max_pt = np.min(points, axis=0), np.max(points, axis=0)
+
+         # normalize points into a unit cube; no ground-truth vectors/edges at inference
+         maxXYZ = np.max(max_pt)
+         minXYZ = np.min(min_pt)
+         min_pt[:] = minXYZ
+         max_pt[:] = maxXYZ
+
+         points = (points - min_pt) / (max_pt - min_pt)
+         points = points.astype(np.float32)
+         min_pt = min_pt.astype(np.float32)
+         max_pt = max_pt.astype(np.float32)
+         pt = np.concatenate((np.expand_dims(min_pt, 0), np.expand_dims(max_pt, 0)), axis=0)
+         data_dict = {'points': points, 'vectors': None, 'edges': None, 'frame_id': frame_id, 'minMaxPt': pt}
+         return data_dict
+
+     @staticmethod
+     def collate_batch(batch_list, _unused=False):
+         data_dict = defaultdict(list)
+         for cur_sample in batch_list:
+             for key, val in cur_sample.items():
+                 data_dict[key].append(val)
+         batch_size = len(batch_list)
+         ret = {}
+         for key, val in data_dict.items():
+             try:
+                 if key == 'points':
+                     ret[key] = np.concatenate(val, axis=0).reshape([batch_size, -1, val[0].shape[-1]])
+                 elif key in ['vectors', 'edges']:
+                     continue  # no ground truth at inference time
+                 elif key in ['frame_id', 'minMaxPt']:
+                     ret[key] = val
+                 else:
+                     ret[key] = np.stack(val, axis=0)
+             except Exception:
+                 print('Error in collate_batch: key=%s' % key)
+                 raise TypeError
+
+         ret['batch_size'] = batch_size
+         return ret
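A toy example of the wireframe OBJ format that load_obj parses: 'v x y z' records define vertices, and 'l i j k ...' records define polylines whose consecutive (1-based) index pairs become undirected edges.

obj_text = 'v 0 0 0\nv 1 0 0\nv 1 1 0\nl 1 2 3\n'
with open('toy.obj', 'w') as f:
    f.write(obj_text)
vs, edges = load_obj('toy.obj')  # vs: (3, 3) float64; edges: {(0, 1), (1, 2)} as an int32 array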
format_dataset.py ADDED
@@ -0,0 +1,53 @@
+ import os
+ import shutil
+ import logging
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ def create_directory(path):
+     if not os.path.exists(path):
+         os.makedirs(path)
+
+ def transfer_files(source_clean_xyz_dir, source_gt_dir, destination_dir, txt_file_path):
+     create_directory(destination_dir)
+
+     subdirectory_paths = []
+
+     for filename in os.listdir(source_clean_xyz_dir):
+         if filename.endswith('.xyz'):
+             base_name = os.path.splitext(filename)[0]
+
+             new_subdir = os.path.join(destination_dir, base_name)
+             create_directory(new_subdir)
+
+             subdirectory_paths.append(new_subdir)
+
+             source_xyz = os.path.join(source_clean_xyz_dir, filename)
+             destination_xyz = os.path.join(new_subdir, 'points.xyz')
+
+             shutil.copy(source_xyz, destination_xyz)
+             logging.info(f'Copied {source_xyz} to {destination_xyz}')
+
+             source_obj = os.path.join(source_gt_dir, f'{base_name}.obj')
+             destination_obj = os.path.join(new_subdir, 'polygon.obj')
+
+             if os.path.exists(source_obj):
+                 shutil.copy(source_obj, destination_obj)
+                 logging.info(f'Copied {source_obj} to {destination_obj}')
+             else:
+                 logging.warning(f'File not found: {source_obj}')
+
+     with open(txt_file_path, 'w') as txt_file:
+         for path in subdirectory_paths:
+             txt_file.write(path + '\n')
+     logging.info(f'Wrote subdirectory paths to {txt_file_path}')
+
+ # Define paths
+ source_clean_xyz_dir = 'Data/hoho_data_train/clean_xyz'
+ source_gt_dir = 'Data/hoho_data_train/gt'
+ destination_dir = 'Data/hoho_data_train'
+ txt_file_path = 'Data/hoho_data_train/train.txt'
+
+ # Run the transfer process
+ transfer_files(source_clean_xyz_dir, source_gt_dir, destination_dir, txt_file_path)
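For reference, the directory layout this script produces, derived from the paths defined above (matching what RoofN3dDataset expects per entry of train.txt):

# Data/hoho_data_train/
#   train.txt              <- one subdirectory path per line
#   <base_name>/
#     points.xyz           <- copied from clean_xyz/<base_name>.xyz
#     polygon.obj          <- copied from gt/<base_name>.obj (when present)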
hoho_train_checkpoint_epoch_90.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:41b6815747720660fdd68b6272fa3bae40b4f09095b6ddf14540f7f26ee284fb
+ size 17019805
model/__pycache__/cluster_refine.cpython-38.pyc ADDED
Binary file (9.14 kB).
 
model/__pycache__/edge_pred_net.cpython-38.pyc ADDED
Binary file (5.32 kB).
 
model/__pycache__/model_utils.cpython-38.pyc ADDED
Binary file (5.88 kB).
 
model/__pycache__/pointnet2.cpython-38.pyc ADDED
Binary file (10.6 kB).
 
model/__pycache__/pointnet_stack_utils.cpython-38.pyc ADDED
Binary file (8.86 kB).
 
model/__pycache__/pointnet_util.cpython-38.pyc ADDED
Binary file (10.7 kB).
 
model/__pycache__/roofnet.cpython-38.pyc ADDED
Binary file (1.32 kB).
 
model/cluster_refine.py ADDED
@@ -0,0 +1,305 @@
+ import torch
+ import numpy as np
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from .pointnet_stack_utils import *
+ from .model_utils import *
+ from scipy.optimize import linear_sum_assignment
+ from utils import loss_utils
+ import pc_util
+
+
+ class ClusterRefineNet(nn.Module):
+     def __init__(self, model_cfg, input_channel):
+         super().__init__()
+         self.model_cfg = model_cfg
+         self.matcher = HungarianMatcher(self.model_cfg.MatchRadius)
+         sa_cfg = model_cfg.RefineSA
+         mlps = sa_cfg.MLPs
+         mlps = [[input_channel] + mlp for mlp in mlps]
+         self.fea_refine_module = StackSAModuleMSG(
+             radii=sa_cfg.Radii,
+             nsamples=sa_cfg.Nsamples,
+             mlps=mlps,
+             use_xyz=True,
+             pool_method='max_pool'
+         )
+         self.num_output_feature = sum(mlp[-1] for mlp in mlps)
+         self.shared_fc = LinearBN(256, 128)
+         self.drop = nn.Dropout(0.5)
+         self.offset_fc = nn.Linear(128, 3)
+         # self.cls_fc = nn.Linear(128, 1)
+         if self.training:
+             self.train_dict = {}
+             # self.add_module(
+             #     'cls_loss_func',
+             #     loss_utils.SigmoidBCELoss()
+             # )
+             self.add_module(
+                 'reg_loss_func',
+                 loss_utils.WeightedSmoothL1Loss()
+             )
+             self.loss_weight = self.model_cfg.LossWeight
+
+         self.init_weights()
+
+     def init_weights(self):
+         for m in self.modules():
+             if isinstance(m, nn.Conv2d):
+                 nn.init.kaiming_normal_(m.weight)
+                 if m.bias is not None:
+                     nn.init.constant_(m.bias, 0)
+             if isinstance(m, nn.BatchNorm2d):
+                 nn.init.constant_(m.weight, 1.0)
+                 nn.init.constant_(m.bias, 0)
+
+     # tips: change from batch to stack
+     def forward(self, batch_dict):
+         offset_pts = batch_dict['points'].clone()
+         offset = batch_dict['point_pred_offset']
+         pts_score = batch_dict['point_pred_score']
+         score_thresh = self.model_cfg.ScoreThresh
+         offset_pts[pts_score > score_thresh] += offset[pts_score > score_thresh]
+         pts_cluster = offset_pts.new_ones(offset_pts.shape) * -10
+         pts_cluster[pts_score > score_thresh] = offset_pts[pts_score > score_thresh]
+         cluster_idx = dbscan_cluster(self.model_cfg.Cluster.eps, self.model_cfg.Cluster.min_pts, pts_cluster)
+         key_pts, num_cluster = get_cluster_pts(pts_cluster, cluster_idx)
+         if self.training:
+             new_pts, targets, labels, matches, new_xyz_batch_cnt = self.matcher(key_pts, batch_dict['vectors'])
+             offset_targets = (targets - new_pts) / self.model_cfg.MatchRadius if new_pts is not None else None
+             batch_dict['matches'] = matches
+             self.train_dict.update({
+                 'keypoint_cls_label': labels,
+                 'keypoint_offset_label': offset_targets
+             })
+         else:
+             pts_list, new_xyz_batch_cnt = [], []
+             for i, pts in enumerate(key_pts):
+                 pts = pts[torch.sum(pts, -1) > -2e1]  # drop padding entries
+                 if len(pts) == 0:
+                     new_xyz_batch_cnt.append(0)
+                     continue
+                 new_xyz_batch_cnt.append(len(pts))
+                 pts_list.append(pts)
+             if sum(new_xyz_batch_cnt) == 0:
+                 new_pts, new_xyz_batch_cnt = None, None
+             else:
+                 new_pts = torch.cat(pts_list, 0)
+                 new_xyz_batch_cnt = new_pts.new_tensor(new_xyz_batch_cnt, dtype=torch.int32)
+         if new_pts is None:
+             exit()  # no keypoints found in the whole batch
+         batch_idx = torch.zeros(new_pts.shape[0], device=new_pts.device)
+         idx = 0
+         for i, cnt in enumerate(new_xyz_batch_cnt):
+             if cnt == 0:
+                 continue
+             batch_idx[idx:idx + cnt] = i
+             idx += cnt
+
+         pos_mask = new_xyz_batch_cnt > 0
+         offset_pts = offset_pts[pos_mask]
+         xyz = offset_pts.view(-1, 3)
+         xyz_batch_cnt = offset_pts.new_ones(offset_pts.shape[0], dtype=torch.int32) * offset_pts.shape[1]
+         new_xyz_batch_cnt = new_xyz_batch_cnt[pos_mask]
+         point_fea = batch_dict['point_features']
+         point_fea = point_fea * pts_score.detach().unsqueeze(-1)
+         point_fea = point_fea[pos_mask]
+         point_fea = point_fea.contiguous().view(-1, point_fea.shape[-1])
+         _, refine_fea = self.fea_refine_module(xyz, xyz_batch_cnt, new_pts, new_xyz_batch_cnt, point_fea)
+
+         x = self.drop(self.shared_fc(refine_fea))
+         pred_offset = self.offset_fc(x)
+         # pred_cls = self.cls_fc(x)
+         if self.training:
+             self.train_dict.update({
+                 # 'keypoint_cls_pred': pred_cls,
+                 'keypoint_offset_pred': pred_offset
+             })
+         batch_dict['keypoint'] = torch.cat([batch_idx.view(-1, 1), new_pts], -1)
+         batch_dict['keypoint_features'] = refine_fea
+         # batch_dict['keypoint_pred_score'] = torch.sigmoid(pred_cls).squeeze(-1)
+         batch_dict['refined_keypoint'] = pred_offset * self.model_cfg.MatchRadius + new_pts
+         return batch_dict
+
+     def loss(self, loss_dict, disp_dict):
+         # pred_cls, pred_offset = self.train_dict['keypoint_cls_pred'], self.train_dict['keypoint_offset_pred']
+         pred_offset = self.train_dict['keypoint_offset_pred']
+         label_cls, label_offset = self.train_dict['keypoint_cls_label'], self.train_dict['keypoint_offset_label']
+         # cls_loss = self.get_cls_loss(pred_cls, label_cls, self.loss_weight['cls_weight'])
+         reg_loss = self.get_reg_loss(pred_offset, label_offset, label_cls, self.loss_weight['reg_weight'])
+         loss = reg_loss
+         # loss = cls_loss + reg_loss
+         loss_dict.update({
+             # 'refine_cls_loss': cls_loss.item(),
+             'refine_offset_loss': reg_loss.item(),
+             'refine_loss': loss.item()
+         })
+
+         # pred_cls = pred_cls.squeeze(-1)
+         # label_cls = label_cls.squeeze(-1)
+         # pred_logit = torch.sigmoid(pred_cls)
+         # pred = torch.where(pred_logit >= 0.5, pred_logit.new_ones(pred_logit.shape),
+         #                    pred_logit.new_zeros(pred_logit.shape))
+         # acc = torch.sum((pred == label_cls) & (label_cls == 1)).item() / torch.sum(label_cls == 1).item()
+         # disp_dict.update({'pos_acc': acc})
+         return loss, loss_dict, disp_dict
+
+     def get_cls_loss(self, pred, label, weight):
+         batch_size = int(pred.shape[0])
+         positives = label > 0
+         negatives = label == 0
+         cls_weights = (negatives * 1.0 + positives * 1.0).float()
+         pos_normalizer = positives.sum(1, keepdim=True).float()
+         cls_weights /= torch.clamp(pos_normalizer, min=1.0)
+         cls_loss_src = self.cls_loss_func(pred.squeeze(-1), label, weights=cls_weights)  # [N, M]
+         cls_loss = cls_loss_src.sum() / batch_size
+
+         cls_loss = cls_loss * weight
+         return cls_loss
+
+     def get_reg_loss(self, pred, label, cls_label, weight):
+         positives = cls_label > 0
+         reg_weights = positives.float()
+         pos_normalizer = positives.sum().float()
+         reg_weights /= torch.clamp(pos_normalizer, min=1.0)
+         reg_loss_src = self.reg_loss_func(pred.unsqueeze(dim=0), label.unsqueeze(dim=0), weights=reg_weights.unsqueeze(dim=0))
+         reg_loss = reg_loss_src.sum()
+         reg_loss = reg_loss * weight
+         return reg_loss
+
+
+ class StackSAModuleMSG(nn.Module):
+
+     def __init__(self, radii, nsamples, mlps, use_xyz, pool_method='max_pool'):
+         """
+         Args:
+             radii: list of float, list of radii to group with
+             nsamples: list of int, number of samples in each ball query
+             mlps: list of list of int, spec of the pointnet before the global pooling for each scale
+             use_xyz:
+             pool_method: max_pool / avg_pool
+         """
+         super().__init__()
+
+         assert len(radii) == len(nsamples) == len(mlps)
+
+         self.groupers = nn.ModuleList()
+         self.mlps = nn.ModuleList()
+         for i in range(len(radii)):
+             radius = radii[i]
+             nsample = nsamples[i]
+             self.groupers.append(QueryAndGroup(radius, nsample, use_xyz=use_xyz))
+             mlp_spec = mlps[i]
+             if use_xyz:
+                 mlp_spec[0] += 3
+
+             shared_mlps = []
+             for k in range(len(mlp_spec) - 1):
+                 shared_mlps.extend([
+                     nn.Conv2d(mlp_spec[k], mlp_spec[k + 1], kernel_size=1, bias=False),
+                     nn.BatchNorm2d(mlp_spec[k + 1]),
+                     nn.ReLU()
+                 ])
+             self.mlps.append(nn.Sequential(*shared_mlps))
+         self.pool_method = pool_method
+
+         self.init_weights()
+
+     def init_weights(self):
+         for m in self.modules():
+             if isinstance(m, nn.Conv2d):
+                 nn.init.kaiming_normal_(m.weight)
+                 if m.bias is not None:
+                     nn.init.constant_(m.bias, 0)
+             if isinstance(m, nn.BatchNorm2d):
+                 nn.init.constant_(m.weight, 1.0)
+                 nn.init.constant_(m.bias, 0)
+
+     def forward(self, xyz, xyz_batch_cnt, new_xyz, new_xyz_batch_cnt, features=None, empty_voxel_set_zeros=True):
+         """
+         :param xyz: (N1 + N2 ..., 3) tensor of the xyz coordinates of the features
+         :param xyz_batch_cnt: (batch_size), [N1, N2, ...]
+         :param new_xyz: (M1 + M2 ..., 3)
+         :param new_xyz_batch_cnt: (batch_size), [M1, M2, ...]
+         :param features: (N1 + N2 ..., C) tensor of the descriptors of the features
+         :return:
+             new_xyz: (M1 + M2 ..., 3) tensor of the new features' xyz
+             new_features: (M1 + M2 ..., \sum_k(mlps[k][-1])) tensor of the new_features descriptors
+         """
+         new_features_list = []
+         for k in range(len(self.groupers)):
+             new_features, ball_idxs = self.groupers[k](
+                 xyz, xyz_batch_cnt, new_xyz, new_xyz_batch_cnt, features
+             )  # (M1 + M2, C, nsample)
+             new_features = new_features.permute(1, 0, 2).unsqueeze(dim=0)  # (1, C, M1 + M2 ..., nsample)
+             new_features = self.mlps[k](new_features)  # (1, C, M1 + M2 ..., nsample)
+
+             if self.pool_method == 'max_pool':
+                 new_features = F.max_pool2d(
+                     new_features, kernel_size=[1, new_features.size(3)]
+                 ).squeeze(dim=-1)  # (1, C, M1 + M2 ...)
+             elif self.pool_method == 'avg_pool':
+                 new_features = F.avg_pool2d(
+                     new_features, kernel_size=[1, new_features.size(3)]
+                 ).squeeze(dim=-1)  # (1, C, M1 + M2 ...)
+             else:
+                 raise NotImplementedError
+             new_features = new_features.squeeze(dim=0).permute(1, 0)  # (M1 + M2 ..., C)
+             new_features_list.append(new_features)
+
+         new_features = torch.cat(new_features_list, dim=1)  # (M1 + M2 ..., C)
+
+         return new_xyz, new_features
+
+
+ class HungarianMatcher(nn.Module):
+     def __init__(self, match_r):
+         super().__init__()
+         self.dist_thresh = match_r
+
+     # tips: matcher with dist threshold
+     @torch.no_grad()
+     def forward(self, output, targets):
+         pts_list, target_list, label_list, match_list, new_xyz_batch_cnt = [], [], [], [], []
+         for i in range(output.shape[0]):
+             tmp_output, tmp_targets = output[i], targets[i]
+             tmp_output = tmp_output[torch.sum(tmp_output, -1) > -2e1]
+             if len(tmp_output) == 0:
+                 new_xyz_batch_cnt.append(0)
+                 continue
+             tmp_targets = tmp_targets[torch.sum(tmp_targets, -1) > -2e1]
+             # squared Euclidean distance matrix via ||a||^2 + ||b||^2 - 2ab
+             vec_a = torch.sum(tmp_output.unsqueeze(1).repeat(1, tmp_targets.shape[0], 1) ** 2, -1)
+             vec_b = torch.sum(tmp_targets.unsqueeze(0).repeat(tmp_output.shape[0], 1, 1) ** 2, -1)
+             dist_matrix = vec_a + vec_b - 2 * torch.mm(tmp_output, tmp_targets.permute(1, 0))
+             dist_matrix = F.relu(dist_matrix)
+             dist_matrix = torch.sqrt(dist_matrix)
+
+             out_ind, tar_ind = linear_sum_assignment(dist_matrix.cpu().numpy())
+             out_ind, tar_ind = dist_matrix.new_tensor(out_ind, dtype=torch.int64), dist_matrix.new_tensor(tar_ind, dtype=torch.int64)
+             dist_val = dist_matrix[out_ind, tar_ind]
+             out_ind = out_ind[dist_val < self.dist_thresh]
+             tar_ind = tar_ind[dist_val < self.dist_thresh]
+
+             pts_list.append(tmp_output)
+             tmp_label = tmp_targets.new_zeros(tmp_output.shape[0])
+             tmp_label[out_ind] = 1.
+             tmp_pts_target = tmp_targets.new_zeros(tmp_output.shape)
+             tmp_pts_target[out_ind] = tmp_targets[tar_ind]
+             tmp_match = tmp_targets.new_ones(tmp_output.shape[0], dtype=torch.int64) * -1
+             tmp_match[out_ind] = tar_ind
+             label_list.append(tmp_label)
+             target_list.append(tmp_pts_target)
+             match_list.append(tmp_match)
+             new_xyz_batch_cnt.append(tmp_output.shape[0])
+         if sum(new_xyz_batch_cnt) == 0:
+             return None, None, None, None, None
+         return torch.cat(pts_list, 0), torch.cat(target_list, 0), torch.cat(label_list, 0), torch.cat(match_list, 0), tmp_output.new_tensor(new_xyz_batch_cnt, dtype=torch.int32)
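The matching strategy in HungarianMatcher can be checked in isolation: build the pairwise Euclidean distance matrix, run linear_sum_assignment, then discard matches beyond the radius. A standalone NumPy/SciPy illustration:

import numpy as np
from scipy.optimize import linear_sum_assignment

pred = np.array([[0.0, 0.0, 0.0], [1.0, 1.0, 0.0]])   # predicted keypoints
gt = np.array([[0.1, 0.0, 0.0], [5.0, 5.0, 5.0]])     # ground-truth corners
dist = np.linalg.norm(pred[:, None, :] - gt[None, :, :], axis=-1)
rows, cols = linear_sum_assignment(dist)              # optimal one-to-one assignment
keep = dist[rows, cols] < 0.5                         # analogue of MatchRadius
print(rows[keep], cols[keep])                         # -> [0] [0]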
model/edge_pred_net.py ADDED
@@ -0,0 +1,173 @@
+ import torch
+ import numpy as np
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from .pointnet_stack_utils import *
+ from .model_utils import *
+ from scipy.optimize import linear_sum_assignment
+ from utils import loss_utils
+ import pc_util
+ import itertools
+
+
+ class EdgeAttentionNet(nn.Module):
+     def __init__(self, model_cfg, input_channel):
+         super().__init__()
+         self.model_cfg = model_cfg
+         self.freeze = False
+
+         self.att_layer = PairedPointAttention(input_channel)
+         num_feature = self.att_layer.num_output_feature
+         self.shared_fc = LinearBN(num_feature, num_feature)
+         self.drop = nn.Dropout(0.5)
+         self.cls_fc = nn.Linear(num_feature, 1)
+         if self.training:
+             self.train_dict = {}
+             self.add_module(
+                 'cls_loss_func',
+                 loss_utils.SigmoidBCELoss()
+             )
+             self.loss_weight = self.model_cfg.LossWeight
+
+         self.init_weights()
+
+     def init_weights(self):
+         for m in self.modules():
+             if isinstance(m, nn.Conv2d):
+                 nn.init.kaiming_normal_(m.weight)
+                 if m.bias is not None:
+                     nn.init.constant_(m.bias, 0)
+             if isinstance(m, nn.BatchNorm2d):
+                 nn.init.constant_(m.weight, 1.0)
+                 nn.init.constant_(m.bias, 0)
+
+     def forward(self, batch_dict):
+         batch_idx = batch_dict['keypoint'][:, 0]
+         point_fea = batch_dict['keypoint_features']
+
+         if self.training:
+             matches = batch_dict['matches']
+             edge_label = batch_dict['edges']
+             bin_label_list = []
+             for i, edge in enumerate(edge_label):
+                 mask = batch_idx == i
+                 tmp_idx = batch_idx[mask]
+                 if tmp_idx.shape[0] <= 1:
+                     continue
+                 match = matches[mask]
+                 match_edge = list(itertools.combinations(match.cpu().numpy(), 2))
+                 match_edge = [tuple(sorted(e)) for e in match_edge]
+                 edge = [tuple(e) for e in edge.cpu().numpy()]
+                 label = edge_label.new_tensor([e in edge for e in match_edge])
+                 bin_label_list.append(label)
+             self.train_dict['label'] = torch.cat(bin_label_list)
+
+         idx = 0
+         pair_idx_list = []
+         pair_idx_list1, pair_idx_list2 = [], []
+         for i in range(batch_dict['batch_size']):
+             mask = batch_idx == i
+             tmp_idx = batch_idx[mask]
+             if tmp_idx.shape[0] <= 1:
+                 continue
+             fea = point_fea[mask]
+             pair_idx = itertools.combinations(range(fea.shape[0]), 2)
+             pair_idx = point_fea.new_tensor(list(pair_idx))
+             pair_idx_list.append(pair_idx)
+             pair_idx_list1.append(pair_idx[:, 0] + idx)
+             pair_idx_list2.append(pair_idx[:, 1] + idx)
+             idx += tmp_idx.shape[0]
+         # print('pair_idx_list:', pair_idx_list)  # debug output
+         if pair_idx_list1 and pair_idx_list2:
+             pair_idx1 = torch.cat(pair_idx_list1).long()
+             pair_idx2 = torch.cat(pair_idx_list2).long()
+             pair_fea1 = point_fea[pair_idx1]
+             pair_fea2 = point_fea[pair_idx2]
+             edge_fea = self.att_layer(pair_fea1, pair_fea2)
+             edge_pred = self.cls_fc(self.drop(self.shared_fc(edge_fea)))
+             batch_dict['pair_points'] = torch.cat(pair_idx_list, 0)
+             batch_dict['edge_score'] = torch.sigmoid(edge_pred).view(-1)
+             if self.training:
+                 self.train_dict['edge_pred'] = edge_pred
+         else:
+             print("Warning: pair_idx_list1 or pair_idx_list2 is empty!")
+             batch_dict['pair_points'] = torch.tensor([])
+             batch_dict['edge_score'] = torch.tensor([])
+             if self.training:
+                 # no candidate pairs: edge_pred is undefined in this branch,
+                 # so store an empty prediction tensor instead
+                 self.train_dict['edge_pred'] = point_fea.new_zeros((0, 1))
+         return batch_dict
+
+     def loss(self, loss_dict, disp_dict):
+         pred_cls = self.train_dict['edge_pred']
+         label_cls = self.train_dict['label']
+         cls_loss = self.get_cls_loss(pred_cls, label_cls, self.loss_weight['cls_weight'])
+         loss = cls_loss
+         loss_dict.update({
+             'edge_cls_loss': cls_loss.item(),
+             'edge_loss': loss.item()
+         })
+
+         pred_cls = pred_cls.squeeze(-1)
+         label_cls = label_cls.squeeze(-1)
+         pred_logit = torch.sigmoid(pred_cls)
+         pred = torch.where(pred_logit >= 0.5, pred_logit.new_ones(pred_logit.shape),
+                            pred_logit.new_zeros(pred_logit.shape))
+         acc = torch.sum((pred == label_cls) & (label_cls == 1)).item() / torch.sum(label_cls == 1).item()
+         # acc = torch.sum((pred == label_cls)).item() / len(label_cls.view(-1))
+         disp_dict.update({'edge_acc': acc})
+         return loss, loss_dict, disp_dict
+
+     def get_cls_loss(self, pred, label, weight):
+         positives = label > 0
+         negatives = label == 0
+         cls_weights = (negatives * 1.0 + positives * 1.0).float()
+         pos_normalizer = positives.sum().float()
+         cls_weights /= torch.clamp(pos_normalizer, min=1.0)
+         cls_loss_src = self.cls_loss_func(pred.squeeze(-1), label, weights=cls_weights)  # [N, M]
+         cls_loss = cls_loss_src.sum()
+
+         cls_loss = cls_loss * weight
+         return cls_loss
+
+
+ class PairedPointAttention(nn.Module):
+     def __init__(self, input_channel):
+         super().__init__()
+         self.edge_att1 = nn.Sequential(
+             nn.Linear(input_channel, input_channel),
+             nn.BatchNorm1d(input_channel),
+             nn.ReLU(),
+             nn.Linear(input_channel, input_channel),
+             nn.Sigmoid(),
+         )
+         self.edge_att2 = nn.Sequential(
+             nn.Linear(input_channel, input_channel),
+             nn.BatchNorm1d(input_channel),
+             nn.ReLU(),
+             nn.Linear(input_channel, input_channel),
+             nn.Sigmoid(),
+         )
+         self.fea_fusion_layer = nn.MaxPool1d(2)
+
+         self.num_output_feature = input_channel
+
+     def forward(self, point_fea1, point_fea2):
+         fusion_fea = point_fea1 + point_fea2
+         att1 = self.edge_att1(fusion_fea)
+         att2 = self.edge_att2(fusion_fea)
+         att_fea1 = point_fea1 * att1
+         att_fea2 = point_fea2 * att2
+         fea = torch.cat([att_fea1.unsqueeze(1), att_fea2.unsqueeze(1)], 1)
+         fea = self.fea_fusion_layer(fea.permute(0, 2, 1)).squeeze(-1)
+         return fea
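A toy forward pass through PairedPointAttention, showing the expected shapes: two aligned batches of keypoint features go in, one fused feature per candidate edge comes out.

import torch

att = PairedPointAttention(input_channel=256).eval()  # eval() keeps BatchNorm in inference mode
fea1 = torch.randn(8, 256)   # features of the first endpoint of 8 candidate edges
fea2 = torch.randn(8, 256)   # features of the second endpoint
with torch.no_grad():
    edge_fea = att(fea1, fea2)
print(edge_fea.shape)        # torch.Size([8, 256])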
model/model_utils.py ADDED
@@ -0,0 +1,156 @@
+ import os
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import pc_util
+ from torch.autograd import Function, Variable
+
+
+ class Conv2ds(nn.Sequential):
+     def __init__(self, cns):
+         super().__init__()
+         for i in range(len(cns) - 1):
+             in_cn, out_cn = cns[i], cns[i + 1]
+             self.add_module('conv%d' % (i + 1), Conv2dBN(in_cn, out_cn))
+
+
+ class Conv2dBN(nn.Module):
+     def __init__(self, in_channel, out_channel):
+         super().__init__()
+         self.bn = nn.BatchNorm2d(out_channel)
+         self.conv = nn.Conv2d(in_channel, out_channel, 1)
+
+     def forward(self, x):
+         return self.bn(F.relu(self.conv(x), inplace=True))
+
+
+ class Conv1ds(nn.Sequential):
+     def __init__(self, cns):
+         super().__init__()
+         for i in range(len(cns) - 1):
+             in_cn, out_cn = cns[i], cns[i + 1]
+             self.add_module('conv%d' % (i + 1), Conv1dBN(in_cn, out_cn))
+
+
+ class Conv1dBN(nn.Module):
+     def __init__(self, in_channel, out_channel):
+         super().__init__()
+         self.bn = nn.BatchNorm1d(out_channel)
+         self.conv = nn.Conv1d(in_channel, out_channel, 1)
+
+     def forward(self, x):
+         return self.bn(F.relu(self.conv(x), inplace=True))
+
+
+ class Linears(nn.Sequential):
+     def __init__(self, cns):
+         super().__init__()
+         for i in range(len(cns) - 1):
+             in_cn, out_cn = cns[i], cns[i + 1]
+             self.add_module('linear%d' % (i + 1), LinearBN(in_cn, out_cn))
+
+
+ class LinearBN(nn.Module):
+     def __init__(self, in_channel, out_channel):
+         super().__init__()
+         self.bn = nn.BatchNorm1d(out_channel)
+         self.conv = nn.Linear(in_channel, out_channel)
+
+     def forward(self, x):
+         return self.bn(F.relu(self.conv(x), inplace=True))
+
+
+ def load_params_with_optimizer(net, filename, to_cpu=False, optimizer=None, logger=None):
+     if not os.path.isfile(filename):
+         raise FileNotFoundError
+
+     logger.info('==> Loading parameters from checkpoint')
+     checkpoint = torch.load(filename)
+     epoch = checkpoint.get('epoch', -1)
+     it = checkpoint.get('it', 0.0)
+
+     net.load_state_dict(checkpoint['model_state'])
+
+     if optimizer is not None:
+         logger.info('==> Loading optimizer parameters from checkpoint')
+         optimizer.load_state_dict(checkpoint['optimizer_state'])
+
+     logger.info('==> Done')
+
+     return it, epoch
+
+
+ def load_params(net, filename, logger=None):
+     if not os.path.isfile(filename):
+         raise FileNotFoundError
+     if logger is not None:
+         logger.info('==> Loading parameters from checkpoint')
+     checkpoint = torch.load(filename)
+
+     net.load_state_dict(checkpoint['model_state'])
+     if logger is not None:
+         logger.info('==> Done')
+
+
+ class DBSCANCluster(Function):
+
+     @staticmethod
+     def forward(ctx, eps: float, min_pts: int, point: torch.Tensor) -> torch.Tensor:
+         """
+         :param ctx:
+         :param eps: float, dbscan eps
+         :param min_pts: int, dbscan core point threshold
+         :param point: (B, N, 3) xyz coordinates of the points
+         :return:
+             idx: (B, N) cluster idx
+         """
+         point = point.contiguous()
+
+         B, N, _ = point.size()
+         idx = torch.cuda.IntTensor(B, N).zero_() - 1  # -1 marks noise points
+
+         pc_util.dbscan_wrapper(B, N, eps, min_pts, point, idx)
+         ctx.mark_non_differentiable(idx)
+         return idx
+
+     @staticmethod
+     def backward(ctx, grad_out):
+         return ()
+
+
+ dbscan_cluster = DBSCANCluster.apply
+
+
+ class GetClusterPts(Function):
+
+     @staticmethod
+     def forward(ctx, point: torch.Tensor, cluster_idx: torch.Tensor) -> torch.Tensor:
+         """
+         :param ctx:
+         :param point: (B, N, 3) xyz coordinates of the points
+         :param cluster_idx: (B, N) cluster idx
+         :return:
+             key_pts: (B, M, 3) cluster center pts, M is max_num_cluster_class
+             num_cluster: (B, M) cluster num, num of pts in each cluster class
+         """
+         cluster_idx = cluster_idx.contiguous()
+
+         B, N = cluster_idx.size()
+         M = torch.max(cluster_idx) + 1
+         key_pts = torch.cuda.FloatTensor(B, M, 3).zero_()
+         num_cluster = torch.cuda.IntTensor(B, M).zero_()
+         pc_util.cluster_pts_wrapper(B, N, M, point, cluster_idx, key_pts, num_cluster)
+         key_pts[key_pts * 1e4 == 0] = -1e1  # mark empty slots with the padding value
+         ctx.mark_non_differentiable(key_pts)
+         ctx.mark_non_differentiable(num_cluster)
+         return key_pts, num_cluster
+
+     @staticmethod
+     def backward(ctx, grad_out):
+         return ()
+
+
+ get_cluster_pts = GetClusterPts.apply
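dbscan_cluster above delegates to the pc_util CUDA extension built by this repo. As a CPU sanity check, scikit-learn's DBSCAN (a stand-in for illustration, not what this commit builds) yields the same kind of per-point labels, with -1 marking noise:

import numpy as np
from sklearn.cluster import DBSCAN

points = np.random.rand(500, 3).astype(np.float32)
labels = DBSCAN(eps=0.05, min_samples=5).fit_predict(points)  # (N,) ints, -1 = noise
centers = [points[labels == c].mean(0) for c in range(labels.max() + 1)]  # per-cluster centroids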
model/pointnet2.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from .pointnet_util import *
6
+ from .model_utils import *
7
+ from utils import loss_utils
8
+
9
+
10
+ class PointNet2(nn.Module):
11
+ def __init__(self, model_cfg, in_channel=3):
12
+ super().__init__()
13
+ self.model_cfg = model_cfg
14
+ self.sa1 = PointNetSAModule(256, 0.1, 16, in_channel, [32, 32, 64])
15
+ self.sa2 = PointNetSAModule(128, 0.2, 16, 64, [64, 64, 128])
16
+ self.sa3 = PointNetSAModule(64, 0.4, 16, 128, [128, 128, 256])
17
+ self.sa4 = PointNetSAModule(16, 0.8, 16, 256, [256, 256, 512])
18
+ self.fp4 = PointNetFPModule(768, [256, 256])
19
+ self.fp3 = PointNetFPModule(384, [256, 256])
20
+ self.fp2 = PointNetFPModule(320, [256, 128])
21
+ self.fp1 = PointNetFPModule(128, [128, 128, 128])
22
+ self.shared_fc = Conv1dBN(128, 128)
23
+ self.drop = nn.Dropout(0.5)
24
+ self.offset_fc = nn.Conv1d(128, 3, 1)
25
+ self.cls_fc = nn.Conv1d(128, 1, 1)
26
+ self.init_weights()
27
+ self.num_output_feature = 128
28
+ if self.training:
29
+ self.train_dict = {}
30
+ self.add_module(
31
+ 'cls_loss_func',
32
+ loss_utils.SigmoidBCELoss()
33
+ )
34
+ self.add_module(
35
+ 'reg_loss_func',
36
+ loss_utils.WeightedSmoothL1Loss()
37
+ )
38
+ self.loss_weight = self.model_cfg.LossWeight
39
+
40
+ def init_weights(self):
41
+ for m in self.modules():
42
+ if isinstance(m, nn.Conv2d):
43
+ nn.init.kaiming_normal_(m.weight)
44
+ if m.bias is not None:
45
+ nn.init.constant_(m.bias, 0)
46
+ if isinstance(m, nn.BatchNorm2d):
47
+ nn.init.constant_(m.weight, 1.0)
48
+ nn.init.constant_(m.bias, 0)
49
+
50
+ def forward(self, batch_dict):
51
+ xyz = batch_dict['points']
52
+ # vectors = batch_dict['vectors']
53
+ vectors = None
54
+ if self.training:
55
+ offset, cls = self.assign_targets(xyz, vectors, self.model_cfg.PosRadius)
56
+ self.train_dict.update({
57
+ 'offset_label': offset,
58
+ 'cls_label': cls
59
+ })
60
+
61
+ fea = xyz
62
+ l0_fea = fea.permute(0, 2, 1)
63
+ l0_xyz = xyz
64
+
65
+ l1_xyz, l1_fea = self.sa1(l0_xyz, l0_fea)
66
+ l2_xyz, l2_fea = self.sa2(l1_xyz, l1_fea)
67
+ l3_xyz, l3_fea = self.sa3(l2_xyz, l2_fea)
68
+ l4_xyz, l4_fea = self.sa4(l3_xyz, l3_fea)
69
+
70
+ l3_fea = self.fp4(l3_xyz, l4_xyz, l3_fea, l4_fea)
71
+ l2_fea = self.fp3(l2_xyz, l3_xyz, l2_fea, l3_fea)
72
+ l1_fea = self.fp2(l1_xyz, l2_xyz, l1_fea, l2_fea)
73
+ l0_fea = self.fp1(l0_xyz, l1_xyz, None, l1_fea)
74
+
75
+ x = self.drop(self.shared_fc(l0_fea))
76
+ pred_offset = self.offset_fc(x).permute(0, 2, 1)
77
+ pred_cls = self.cls_fc(x).permute(0, 2, 1)
78
+ if self.training:
79
+ self.train_dict.update({
80
+ 'cls_pred': pred_cls,
81
+ 'offset_pred': pred_offset
82
+ })
83
+ batch_dict['point_features'] = l0_fea.permute(0, 2, 1)
84
+ batch_dict['point_pred_score'] = torch.sigmoid(pred_cls).squeeze(-1)
85
+ batch_dict['point_pred_offset'] = pred_offset * self.model_cfg.PosRadius
86
+ return batch_dict
87
+
88
+ def loss(self, loss_dict, disp_dict):
89
+ pred_cls, pred_offset = self.train_dict['cls_pred'], self.train_dict['offset_pred']
90
+ label_cls, label_offset = self.train_dict['cls_label'], self.train_dict['offset_label']
91
+ cls_loss = self.get_cls_loss(pred_cls, label_cls, self.loss_weight['cls_weight'])
92
+ reg_loss = self.get_reg_loss(pred_offset, label_offset, label_cls, self.loss_weight['reg_weight'])
93
+ loss = cls_loss + reg_loss
94
+ loss_dict.update({
95
+ 'pts_cls_loss': cls_loss.item(),
96
+ 'pts_offset_loss': reg_loss.item(),
97
+ 'pts_loss': loss.item()
98
+ })
99
+
100
+ pred_cls = pred_cls.squeeze(-1)
101
+ label_cls = label_cls.squeeze(-1)
102
+ pred_logit = torch.sigmoid(pred_cls)
103
+ pred = torch.where(pred_logit >= 0.5, pred_logit.new_ones(pred_logit.shape), pred_logit.new_zeros(pred_logit.shape))
104
+ acc = torch.sum((pred == label_cls) & (label_cls == 1)).item() / torch.sum(label_cls == 1).item()
105
+ #acc = torch.sum(pred == label_cls).item() / len(label_cls.view(-1))
106
+ disp_dict.update({'pts_acc': acc})
107
+ return loss, loss_dict, disp_dict
108
+
109
+ def get_cls_loss(self, pred, label, weight):
110
+ batch_size = int(pred.shape[0])
111
+ positives = label > 0
112
+ negatives = label == 0
113
+ cls_weights = (negatives * 1.0 + positives * 1.0).float()
114
+ pos_normalizer = positives.sum(1, keepdim=True).float()
115
+ cls_weights /= torch.clamp(pos_normalizer, min=1.0)
116
+ cls_loss_src = self.cls_loss_func(pred.squeeze(-1), label, weights=cls_weights) # [N, M]
117
+ cls_loss = cls_loss_src.sum() / batch_size
118
+
119
+ cls_loss = cls_loss * weight
120
+ return cls_loss
121
+
122
+ def get_reg_loss(self, pred, label, cls_label, weight):
123
+ batch_size = int(pred.shape[0])
124
+ positives = cls_label > 0
125
+ reg_weights = positives.float()
126
+ pos_normalizer = positives.sum(1, keepdim=True).float()
127
+ reg_weights /= torch.clamp(pos_normalizer, min=1.0)
128
+ reg_loss_src = self.reg_loss_func(pred, label, weights=reg_weights) # [N, M]
129
+ reg_loss = reg_loss_src.sum() / batch_size
130
+ reg_loss = reg_loss * weight
131
+ return reg_loss
132
+
133
+ def assign_targets(self, points, gvs, radius):
134
+ idx = ball_center_query(radius, points, gvs).type(torch.int64)
135
+ batch_size = gvs.size()[0]
136
+     idx_add = torch.arange(batch_size).to(idx.device).unsqueeze(-1).repeat(1, idx.shape[-1]) * gvs.shape[1]
+     gvs = gvs.view(-1, 3)
+     idx_add += idx
+     target_points = gvs[idx_add.view(-1)].view(batch_size, -1, 3)
+     dis = target_points - points
+     dis[idx < 0] = 0
+     dis /= radius
+     label = torch.where(idx >= 0, torch.ones(idx.shape).to(idx.device),
+                         torch.zeros(idx.shape).to(idx.device))
+     return dis, label
+
+
+ class PointNetSAModuleMSG(nn.Module):
+     def __init__(self, npoint, radii, nsamples, in_channel, mlps, use_xyz=True):
+         """
+         PointNet Set Abstraction Module (multi-scale grouping)
+         :param npoint: int, number of sampled centroids
+         :param radii: list of float, radius of each ball_query
+         :param nsamples: list of int, number of samples in each ball_query
+         :param in_channel: int, number of input feature channels
+         :param mlps: list of list of int, output channels of each MLP branch
+         :param use_xyz: bool, whether to append xyz offsets to the features
+         """
+         super().__init__()
+         assert len(radii) == len(nsamples) == len(mlps)
+         mlps = [[in_channel] + mlp for mlp in mlps]
+         self.npoint = npoint
+         self.groupers = nn.ModuleList()
+         self.mlps = nn.ModuleList()
+
+         for i in range(len(radii)):
+             r = radii[i]
+             nsample = nsamples[i]
+             mlp = mlps[i]
+             if use_xyz:
+                 mlp[0] += 3
+             self.groupers.append(QueryAndGroup(r, nsample, use_xyz) if npoint is not None else GroupAll(use_xyz))
+             self.mlps.append(Conv2ds(mlp))
+
+     def forward(self, xyz, features, new_xyz=None):
+         """
+         :param xyz: (B, N, 3) tensor of the xyz coordinates of the features
+         :param features: (B, C, N) tensor of the descriptors of the features
+         :param new_xyz: optional (B, npoint, 3) tensor of precomputed centroids
+         :return:
+             new_xyz: (B, npoint, 3) tensor of the new features' xyz
+             new_features: (B, C1, npoint) tensor of the new features' descriptors
+         """
+         new_features_list = []
+         xyz = xyz.contiguous()
+         xyz_flipped = xyz.permute(0, 2, 1)
+         if new_xyz is None:
+             new_xyz = gather_operation(xyz_flipped, furthest_point_sample(
+                 xyz, self.npoint, 1.0, 0.0)).permute(0, 2, 1) if self.npoint is not None else None
+
+         for i in range(len(self.groupers)):
+             new_features = self.groupers[i](xyz, new_xyz, features)  # (B, C, npoint, nsample)
+             new_features = self.mlps[i](new_features)  # (B, mlp[-1], npoint, nsample)
+             new_features = F.max_pool2d(new_features, kernel_size=[1, new_features.size(3)]).squeeze(-1)
+             new_features_list.append(new_features)
+
+         return new_xyz, torch.cat(new_features_list, dim=1)
+
+
+ class PointNetSAModule(PointNetSAModuleMSG):
+     def __init__(self, npoint, radius, nsample, in_channel, mlp, use_xyz=True):
+         super().__init__(npoint, [radius], [nsample], in_channel, [mlp], use_xyz)
+
+
+ class PointNetFPModule(nn.Module):
+     def __init__(self, in_channel, mlp):
+         super().__init__()
+         self.mlp = Conv2ds([in_channel] + mlp)
+
+     def forward(self, pts1, pts2, fea1, fea2):
+         """
+         :param pts1: (B, n, 3)
+         :param pts2: (B, m, 3), n > m
+         :param fea1: (B, C1, n)
+         :param fea2: (B, C2, m)
+         :return:
+             new_features: (B, mlp[-1], n)
+         """
+         if pts2 is not None:
+             dist, idx = three_nn(pts1, pts2)
+             dist_recip = 1.0 / (dist + 1e-8)
+             norm = torch.sum(dist_recip, dim=2, keepdim=True)
+             weight = dist_recip / norm
+             interpolated_feats = three_interpolate(fea2, idx, weight)
+         else:
+             interpolated_feats = fea2.expand(*fea2.size()[0:2], pts1.size(1))
+
+         if fea1 is not None:
+             new_features = torch.cat([interpolated_feats, fea1], dim=1)  # (B, C2 + C1, n)
+         else:
+             new_features = interpolated_feats
+
+         new_features = new_features.unsqueeze(-1)
+         new_features = self.mlp(new_features)
+
+         return new_features.squeeze(-1)
+
+
+ class QueryAndGroup(nn.Module):
+     def __init__(self, radius: float, nsample: int, use_xyz: bool = True):
+         """
+         :param radius: float, radius of ball
+         :param nsample: int, maximum number of features to gather in the ball
+         :param use_xyz: bool, whether to append xyz offsets to the features
+         """
+         super().__init__()
+         self.radius, self.nsample, self.use_xyz = radius, nsample, use_xyz
+
+     def forward(self, xyz: torch.Tensor, new_xyz: torch.Tensor, features: torch.Tensor = None):
+         """
+         :param xyz: (B, N, 3) xyz coordinates of the features
+         :param new_xyz: (B, npoint, 3) centroids
+         :param features: (B, C, N) descriptors of the features
+         :return:
+             new_features: (B, 3 + C, npoint, nsample)
+         """
+         idx = ball_query(self.radius, self.nsample, xyz, new_xyz)
+         # _, idx = pointnet_util.knn_query(self.nsample, xyz, new_xyz)
+         xyz_trans = xyz.permute(0, 2, 1)
+         grouped_xyz = grouping_operation(xyz_trans, idx)  # (B, 3, npoint, nsample)
+         grouped_xyz -= new_xyz.permute(0, 2, 1).unsqueeze(-1)
+
+         if features is not None:
+             grouped_features = grouping_operation(features, idx)
+             if self.use_xyz:
+                 new_features = torch.cat([grouped_xyz, grouped_features], dim=1)  # (B, C + 3, npoint, nsample)
+             else:
+                 new_features = grouped_features
+         else:
+             assert self.use_xyz, "Cannot omit features and disable use_xyz at the same time!"
+             new_features = grouped_xyz
+
+         return new_features
+
+
+ class GroupAll(nn.Module):
+     def __init__(self, use_xyz: bool = True):
+         super().__init__()
+         self.use_xyz = use_xyz
+
+     def forward(self, xyz: torch.Tensor, new_xyz: torch.Tensor, features: torch.Tensor = None):
+         """
+         :param xyz: (B, N, 3) xyz coordinates of the features
+         :param new_xyz: ignored
+         :param features: (B, C, N) descriptors of the features
+         :return:
+             new_features: (B, C + 3, 1, N)
+         """
+         grouped_xyz = xyz.permute(0, 2, 1).unsqueeze(2)
+         if features is not None:
+             grouped_features = features.unsqueeze(2)
+             if self.use_xyz:
+                 new_features = torch.cat([grouped_xyz, grouped_features], dim=1)  # (B, 3 + C, 1, N)
+             else:
+                 new_features = grouped_features
+         else:
+             new_features = grouped_xyz
+
+         return new_features
+
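Editor's note: a minimal smoke sketch of the SA/FP modules above, assuming this file's import path (model/pointnet2.py, inferred from the diff's file list), a built pc_util extension, and a GPU. Shapes in the comments follow the docstrings; the mlps/radii values are illustrative only.

import torch
from model.pointnet2 import PointNetSAModuleMSG, PointNetFPModule  # assumed import path

xyz = torch.rand(2, 1024, 3).cuda()        # (B, N, 3) point coordinates
feats = torch.rand(2, 16, 1024).cuda()     # (B, C, N) point descriptors

sa = PointNetSAModuleMSG(npoint=256, radii=[0.1, 0.2], nsamples=[16, 32],
                         in_channel=16, mlps=[[32, 64], [32, 64]]).cuda()
new_xyz, new_feats = sa(xyz, feats)        # (2, 256, 3), (2, 128, 256): 64 channels per radius branch

fp = PointNetFPModule(in_channel=128 + 16, mlp=[64]).cuda()
dense_feats = fp(xyz, new_xyz, feats, new_feats)   # (2, 64, 1024): features propagated back to all points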
model/pointnet_stack_utils.py ADDED
@@ -0,0 +1,265 @@
+ import torch
+ import torch.nn as nn
+ from torch.autograd import Function, Variable
+ import pc_util
+
+
+ class BallQuery(Function):
+
+     @staticmethod
+     def forward(ctx, radius: float, nsample: int, xyz: torch.Tensor, xyz_batch_cnt: torch.Tensor,
+                 new_xyz: torch.Tensor, new_xyz_batch_cnt):
+         """
+         Args:
+             ctx:
+             radius: float, radius of the balls
+             nsample: int, maximum number of features in the balls
+             xyz: (N1 + N2 ..., 3) xyz coordinates of the features
+             xyz_batch_cnt: (batch_size), [N1, N2, ...]
+             new_xyz: (M1 + M2 ..., 3) centers of the ball query
+             new_xyz_batch_cnt: (batch_size), [M1, M2, ...]
+
+         Returns:
+             idx: (M1 + M2 ..., nsample) tensor with the indices of the features that form the query balls
+             empty_ball_mask: (M1 + M2 ...) bool mask of query balls that found no neighbors
+         """
+         assert new_xyz.is_contiguous()
+         assert new_xyz_batch_cnt.is_contiguous()
+         assert xyz.is_contiguous()
+         assert xyz_batch_cnt.is_contiguous()
+
+         B = xyz_batch_cnt.shape[0]
+         M = new_xyz.shape[0]
+         idx = torch.cuda.IntTensor(M, nsample).zero_()
+
+         pc_util.ball_query_wrapper_stack(B, M, radius, nsample, new_xyz, new_xyz_batch_cnt, xyz, xyz_batch_cnt, idx)
+         empty_ball_mask = (idx[:, 0] == -1)
+         idx[empty_ball_mask] = 0
+         return idx, empty_ball_mask
+
+     @staticmethod
+     def backward(ctx, a=None):
+         return None, None, None, None
+
+
+ ball_query = BallQuery.apply
+
+
+ class GroupingOperation(Function):
+
+     @staticmethod
+     def forward(ctx, features: torch.Tensor, features_batch_cnt: torch.Tensor,
+                 idx: torch.Tensor, idx_batch_cnt: torch.Tensor):
+         """
+         Args:
+             ctx:
+             features: (N1 + N2 ..., C) tensor of features to group
+             features_batch_cnt: (batch_size), [N1, N2, ...] number of feature points per batch element
+             idx: (M1 + M2 ..., nsample) tensor containing the indices of the features to group with
+             idx_batch_cnt: (batch_size), [M1, M2, ...] number of query points per batch element
+
+         Returns:
+             output: (M1 + M2 ..., C, nsample) tensor
+         """
+         assert features.is_contiguous()
+         assert features_batch_cnt.is_contiguous()
+         assert idx.is_contiguous()
+         assert idx_batch_cnt.is_contiguous()
+
+         assert features.shape[0] == features_batch_cnt.sum(), \
+             'features: %s, features_batch_cnt: %s' % (str(features.shape), str(features_batch_cnt))
+         assert idx.shape[0] == idx_batch_cnt.sum(), \
+             'idx: %s, idx_batch_cnt: %s' % (str(idx.shape), str(idx_batch_cnt))
+
+         M, nsample = idx.size()
+         N, C = features.size()
+         B = idx_batch_cnt.shape[0]
+         output = torch.cuda.FloatTensor(M, C, nsample)
+
+         pc_util.group_points_wrapper_stack(B, M, C, nsample, features, features_batch_cnt, idx, idx_batch_cnt, output)
+
+         ctx.for_backwards = (B, N, idx, features_batch_cnt, idx_batch_cnt)
+         return output
+
+     @staticmethod
+     def backward(ctx, grad_out: torch.Tensor):
+         """
+         Args:
+             ctx:
+             grad_out: (M1 + M2 ..., C, nsample) tensor of the gradients of the output from forward
+
+         Returns:
+             grad_features: (N1 + N2 ..., C) gradient of the features
+         """
+         B, N, idx, features_batch_cnt, idx_batch_cnt = ctx.for_backwards
+
+         M, C, nsample = grad_out.size()
+         grad_features = Variable(torch.cuda.FloatTensor(N, C).zero_())
+
+         grad_out_data = grad_out.data.contiguous()
+         pc_util.group_points_grad_wrapper_stack(B, M, C, N, nsample, grad_out_data, idx,
+                                                 idx_batch_cnt, features_batch_cnt, grad_features.data)
+         return grad_features, None, None, None
+
+
+ grouping_operation = GroupingOperation.apply
+
+
+ class QueryAndGroup(nn.Module):
+     def __init__(self, radius: float, nsample: int, use_xyz: bool = True):
+         """
+         Args:
+             radius: float, radius of ball
+             nsample: int, maximum number of features to gather in the ball
+             use_xyz: bool, whether to append xyz offsets to the features
+         """
+         super().__init__()
+         self.radius, self.nsample, self.use_xyz = radius, nsample, use_xyz
+
+     def forward(self, xyz: torch.Tensor, xyz_batch_cnt: torch.Tensor,
+                 new_xyz: torch.Tensor, new_xyz_batch_cnt: torch.Tensor,
+                 features: torch.Tensor = None):
+         """
+         Args:
+             xyz: (N1 + N2 ..., 3) xyz coordinates of the features
+             xyz_batch_cnt: (batch_size), [N1, N2, ...]
+             new_xyz: (M1 + M2 ..., 3) centers of the ball query
+             new_xyz_batch_cnt: (batch_size), [M1, M2, ...]
+             features: (N1 + N2 ..., C) tensor of features to group
+
+         Returns:
+             new_features: (M1 + M2 ..., C, nsample) tensor
+         """
+         assert xyz.shape[0] == xyz_batch_cnt.sum(), \
+             'xyz: %s, xyz_batch_cnt: %s' % (str(xyz.shape), str(xyz_batch_cnt))
+         assert new_xyz.shape[0] == new_xyz_batch_cnt.sum(), \
+             'new_xyz: %s, new_xyz_batch_cnt: %s' % (str(new_xyz.shape), str(new_xyz_batch_cnt))
+
+         # idx: (M1 + M2 ..., nsample), empty_ball_mask: (M1 + M2 ...)
+         idx, empty_ball_mask = ball_query(self.radius, self.nsample, xyz, xyz_batch_cnt, new_xyz, new_xyz_batch_cnt)
+         grouped_xyz = grouping_operation(xyz, xyz_batch_cnt, idx, new_xyz_batch_cnt)  # (M1 + M2, 3, nsample)
+         grouped_xyz -= new_xyz.unsqueeze(-1)
+
+         grouped_xyz[empty_ball_mask] = 0
+
+         if features is not None:
+             grouped_features = grouping_operation(features, xyz_batch_cnt, idx, new_xyz_batch_cnt)  # (M1 + M2, C, nsample)
+             grouped_features[empty_ball_mask] = 0
+             if self.use_xyz:
+                 new_features = torch.cat([grouped_xyz, grouped_features], dim=1)  # (M1 + M2 ..., C + 3, nsample)
+             else:
+                 new_features = grouped_features
+         else:
+             assert self.use_xyz, "Cannot omit features and disable use_xyz at the same time!"
+             new_features = grouped_xyz
+
+         return new_features, idx
+
+
+ class FurthestPointSampling(Function):
+     @staticmethod
+     def forward(ctx, xyz: torch.Tensor, npoint: int):
+         """
+         Args:
+             ctx:
+             xyz: (B, N, 3) where N > npoint
+             npoint: int, number of features in the sampled set
+
+         Returns:
+             output: (B, npoint) tensor containing the sampled indices
+         """
+         assert xyz.is_contiguous()
+
+         B, N, _ = xyz.size()
+         output = torch.cuda.IntTensor(B, npoint)
+         temp = torch.cuda.FloatTensor(B, N).fill_(1e10)
+
+         pc_util.furthest_point_sampling_wrapper(B, N, npoint, xyz, temp, output)
+         return output
+
+     @staticmethod
+     def backward(ctx, a=None):
+         return None, None
+
+
+ furthest_point_sample = FurthestPointSampling.apply
+
+
+ class ThreeNN(Function):
+     @staticmethod
+     def forward(ctx, unknown, unknown_batch_cnt, known, known_batch_cnt):
+         """
+         Args:
+             ctx:
+             unknown: (N1 + N2 ..., 3)
+             unknown_batch_cnt: (batch_size), [N1, N2, ...]
+             known: (M1 + M2 ..., 3)
+             known_batch_cnt: (batch_size), [M1, M2, ...]
+
+         Returns:
+             dist: (N1 + N2 ..., 3) l2 distance to the three nearest neighbors
+             idx: (N1 + N2 ..., 3) index of the three nearest neighbors, range [0, M1 + M2 + ...)
+         """
+         assert unknown.shape.__len__() == 2 and unknown.shape[1] == 3
+         assert known.shape.__len__() == 2 and known.shape[1] == 3
+         assert unknown_batch_cnt.__len__() == known_batch_cnt.__len__()
+
+         dist2 = unknown.new_zeros(unknown.shape)
+         idx = unknown_batch_cnt.new_zeros(unknown.shape).int()
+
+         pc_util.three_nn_wrapper_stack(
+             unknown.contiguous(), unknown_batch_cnt.contiguous(),
+             known.contiguous(), known_batch_cnt.contiguous(), dist2, idx
+         )
+         return torch.sqrt(dist2), idx
+
+     @staticmethod
+     def backward(ctx, a=None, b=None):
+         return None, None
+
+
+ three_nn = ThreeNN.apply
+
+
+ class ThreeInterpolate(Function):
+
+     @staticmethod
+     def forward(ctx, features: torch.Tensor, idx: torch.Tensor, weight: torch.Tensor):
+         """
+         Args:
+             ctx:
+             features: (M1 + M2 ..., C)
+             idx: (N1 + N2 ..., 3)
+             weight: (N1 + N2 ..., 3)
+
+         Returns:
+             out_tensor: (N1 + N2 ..., C)
+         """
+         assert idx.shape[0] == weight.shape[0] and idx.shape[1] == weight.shape[1] == 3
+
+         ctx.three_interpolate_for_backward = (idx, weight, features.shape[0])
+         output = features.new_zeros((idx.shape[0], features.shape[1]))
+         pc_util.three_interpolate_wrapper_stack(features.contiguous(), idx.contiguous(), weight.contiguous(), output)
+         return output
+
+     @staticmethod
+     def backward(ctx, grad_out: torch.Tensor):
+         """
+         Args:
+             ctx:
+             grad_out: (N1 + N2 ..., C)
+
+         Returns:
+             grad_features: (M1 + M2 ..., C)
+         """
+         idx, weight, M = ctx.three_interpolate_for_backward
+         grad_features = grad_out.new_zeros((M, grad_out.shape[1]))
+         pc_util.three_interpolate_grad_wrapper_stack(
+             grad_out.contiguous(), idx.contiguous(), weight.contiguous(), grad_features
+         )
+         return grad_features, None, None
+
+
+ three_interpolate = ThreeInterpolate.apply
+
+
+ if __name__ == '__main__':
+     pass
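Editor's note: a sketch of the "stack" batching convention these wrappers expect, where points from all scenes are concatenated along dim 0 and per-scene counts travel separately. It assumes the pc_util extension is built, a GPU is available, and the import path below (inferred from the diff's file list).

import torch
from model.pointnet_stack_utils import QueryAndGroup  # assumed import path

xyz = torch.rand(1000 + 800, 3).cuda()                              # two scenes stacked along dim 0
xyz_batch_cnt = torch.tensor([1000, 800], dtype=torch.int32).cuda()
new_xyz = torch.rand(128 + 64, 3).cuda()                            # stacked query centers
new_xyz_batch_cnt = torch.tensor([128, 64], dtype=torch.int32).cuda()
features = torch.rand(1800, 16).cuda()                              # (N1 + N2, C)

grouper = QueryAndGroup(radius=0.2, nsample=16, use_xyz=True)
new_features, idx = grouper(xyz, xyz_batch_cnt, new_xyz, new_xyz_batch_cnt, features)
# new_features: (192, 19, 16) -> (M1 + M2, C + 3, nsample)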
model/pointnet_util.py ADDED
@@ -0,0 +1,518 @@
+ import torch
+ import torch.nn as nn
+ import numpy as np
+ from torch.autograd import Function, Variable
+ from typing import Tuple
+
+ import pc_util
+
+
+ # class FurthestPointSampling(Function):
+ #     @staticmethod
+ #     def forward(ctx, xyz: torch.Tensor, npoint: int, wd: float = 1.0, wf: float = 0.0) -> torch.Tensor:
+ #         """
+ #         Uses iterative furthest point sampling to select a set of npoint features that have the largest
+ #         minimum distance
+ #         :param ctx:
+ #         :param xyz: (B, N, C) where N > npoint
+ #         :param npoint: int, number of features in the sampled set
+ #         :param wd: float, weight of xyz distance
+ #         :param wf: float, weight of fea distance
+ #         :return:
+ #             output: (B, npoint) tensor containing the set
+ #         """
+ #         xyz = xyz.contiguous()
+ #
+ #         B, N, C = xyz.size()
+ #         output = torch.cuda.IntTensor(B, npoint)
+ #         temp = torch.cuda.FloatTensor(B, N).fill_(1e10)
+ #
+ #         pc_util.furthest_point_sampling_wrapper(B, C, N, npoint, wd, wf, xyz, temp, output)
+ #         # pc_util.furthest_point_sampling_wrapper(B, N, npoint, xyz, temp, output)
+ #         ctx.mark_non_differentiable(output)
+ #         return output
+ #
+ #     @staticmethod
+ #     def backward(ctx, grad_out):
+ #         return ()
+
+
+ class FurthestPointSampling(Function):
+     @staticmethod
+     def forward(ctx, xyz: torch.Tensor, npoint: int, wd: float = 1.0, wf: float = 0.0) -> torch.Tensor:
+         """
+         Uses iterative furthest point sampling to select a set of npoint features that have the largest
+         minimum distance.
+         :param ctx:
+         :param xyz: (B, N, C) where N > npoint
+         :param npoint: int, number of features in the sampled set
+         :param wd: float, weight of xyz distance
+         :param wf: float, weight of feature distance
+         :return:
+             output: (B, npoint) tensor containing the sampled indices
+         """
+         if not torch.cuda.is_available():
+             raise RuntimeError("CUDA is not available, and no CPU fallback is implemented.")
+
+         xyz = xyz.contiguous()
+
+         B, N, C = xyz.size()
+         device = torch.device('cuda')
+         output = torch.zeros(B, npoint, dtype=torch.int32, device=device)
+         temp = torch.full((B, N), 1e10, dtype=torch.float32, device=device)
+
+         pc_util.furthest_point_sampling_wrapper(B, C, N, npoint, wd, wf, xyz, temp, output)
+         ctx.mark_non_differentiable(output)
+         return output
+
+     @staticmethod
+     def backward(ctx, grad_out):
+         return ()
+
+
+ furthest_point_sample = FurthestPointSampling.apply
+
+
+ # class GatherOperation(Function):
+ #
+ #     @staticmethod
+ #     def forward(ctx, features: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
+ #         """
+ #         :param ctx:
+ #         :param features: (B, C, N)
+ #         :param idx: (B, npoint) index tensor of the features to gather
+ #         :return:
+ #             output: (B, C, npoint)
+ #         """
+ #         features = features.contiguous()
+ #         idx = idx.contiguous()
+ #
+ #         B, npoint = idx.size()
+ #         _, C, N = features.size()
+ #         output = torch.cuda.FloatTensor(B, C, npoint)
+ #
+ #         pc_util.gather_points_wrapper(B, C, N, npoint, features, idx, output)
+ #
+ #         ctx.save_for_backwards = (idx, features)
+ #         return output
+ #
+ #     @staticmethod
+ #     def backward(ctx, grad_out):
+ #         idx, features = ctx.saved_tensors
+ #         B, npoint = idx.size()
+ #         _, C, N = features.size()
+ #
+ #         grad_features = Variable(torch.cuda.FloatTensor(B, C, N).zero_())
+ #         grad_out_data = grad_out.data.contiguous()
+ #         pc_util.gather_points_grad_wrapper(B, C, N, npoint, grad_out_data, idx, grad_features.data)
+ #         return grad_features, None
+
+
+ class GatherOperation(Function):
+
+     @staticmethod
+     def forward(ctx, features: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
+         """
+         :param ctx:
+         :param features: (B, C, N)
+         :param idx: (B, npoint) index tensor of the features to gather
+         :return:
+             output: (B, C, npoint)
+         """
+         if not torch.cuda.is_available():
+             raise RuntimeError("CUDA is not available, and no CPU fallback is implemented.")
+
+         features = features.contiguous()
+         idx = idx.contiguous()
+
+         B, npoint = idx.size()
+         _, C, N = features.size()
+         device = torch.device('cuda')
+         output = torch.zeros(B, C, npoint, dtype=torch.float32, device=device)
+
+         pc_util.gather_points_wrapper(B, C, N, npoint, features, idx, output)
+
+         ctx.save_for_backward(idx, features)
+         return output
+
+     @staticmethod
+     def backward(ctx, grad_out):
+         idx, features = ctx.saved_tensors
+         B, npoint = idx.size()
+         _, C, N = features.size()
+
+         device = torch.device('cuda')
+         grad_features = torch.zeros(B, C, N, dtype=torch.float32, device=device)
+         grad_out_data = grad_out.contiguous()
+         pc_util.gather_points_grad_wrapper(B, C, N, npoint, grad_out_data, idx, grad_features)
+         return grad_features, None
+
+
+ gather_operation = GatherOperation.apply
+
+
+ class ThreeNN(Function):
+
+     @staticmethod
+     def forward(ctx, unknown: torch.Tensor, known: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+         """
+         Find the three nearest neighbors of unknown in known
+         :param ctx:
+         :param unknown: (B, N, 3)
+         :param known: (B, M, 3)
+         :return:
+             dist: (B, N, 3) l2 distance to the three nearest neighbors
+             idx: (B, N, 3) index of the three nearest neighbors
+         """
+         unknown = unknown.contiguous()
+         known = known.contiguous()
+
+         B, N, _ = unknown.size()
+         m = known.size(1)
+         dist2 = torch.cuda.FloatTensor(B, N, 3)
+         idx = torch.cuda.IntTensor(B, N, 3)
+
+         pc_util.three_nn_wrapper(B, N, m, unknown, known, dist2, idx)
+         return torch.sqrt(dist2), idx
+
+     @staticmethod
+     def backward(ctx, a=None, b=None):
+         return ()
+
+
+ three_nn = ThreeNN.apply
+
+
+ class ThreeInterpolate(Function):
+
+     @staticmethod
+     def forward(ctx, features: torch.Tensor, idx: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
+         """
+         Performs weighted linear interpolation of 3 features
+         :param ctx:
+         :param features: (B, C, M) feature descriptors to be interpolated from
+         :param idx: (B, n, 3) three nearest neighbors of the target features in features
+         :param weight: (B, n, 3) weights
+         :return:
+             output: (B, C, N) tensor of the interpolated features
+         """
+         features = features.contiguous()
+         idx = idx.contiguous()
+         weight = weight.contiguous()
+
+         B, c, m = features.size()
+         n = idx.size(1)
+         ctx.save_for_backward(idx, weight, features)
+         output = torch.cuda.FloatTensor(B, c, n)
+
+         pc_util.three_interpolate_wrapper(B, c, m, n, features, idx, weight, output)
+         return output
+
+     @staticmethod
+     def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+         """
+         :param ctx:
+         :param grad_out: (B, C, N) tensor with gradients of outputs
+         :return:
+             grad_features: (B, C, M) tensor with gradients of features
+             None:
+             None:
+         """
+         idx, weight, features = ctx.saved_tensors
+         m = features.size(2)
+         B, c, n = grad_out.size()
+
+         grad_features = Variable(torch.cuda.FloatTensor(B, c, m).zero_())
+         grad_out_data = grad_out.data.contiguous()
+
+         pc_util.three_interpolate_grad_wrapper(B, c, n, m, grad_out_data, idx, weight, grad_features.data)
+         return grad_features, torch.zeros_like(idx), torch.zeros_like(weight)
+
+
+ three_interpolate = ThreeInterpolate.apply
+
+
+ # class GroupingOperation(Function):
+ #
+ #     @staticmethod
+ #     def forward(ctx, features: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
+ #         """
+ #         :param ctx:
+ #         :param features: (B, C, N) tensor of features to group
+ #         :param idx: (B, npoint, nsample) tensor containing the indices of features to group with
+ #         :return:
+ #             output: (B, C, npoint, nsample) tensor
+ #         """
+ #         features = features.contiguous()
+ #         idx = idx.contiguous()
+ #
+ #         B, nfeatures, nsample = idx.size()
+ #         _, C, N = features.size()
+ #         output = torch.cuda.FloatTensor(B, C, nfeatures, nsample)
+ #
+ #         pc_util.group_points_wrapper(B, C, N, nfeatures, nsample, features, idx, output)
+ #
+ #         ctx.save_for_backward(idx, features)
+ #         return output
+ #
+ #     @staticmethod
+ #     def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ #         """
+ #         :param ctx:
+ #         :param grad_out: (B, C, npoint, nsample) tensor of the gradients of the output from forward
+ #         :return:
+ #             grad_features: (B, C, N) gradient of the features
+ #         """
+ #         idx, features = ctx.saved_tensors
+ #         N = features.size(2)
+ #
+ #         B, C, npoint, nsample = grad_out.size()
+ #         grad_features = Variable(torch.cuda.FloatTensor(B, C, N).zero_())
+ #
+ #         grad_out_data = grad_out.data.contiguous()
+ #         pc_util.group_points_grad_wrapper(B, C, N, npoint, nsample, grad_out_data, idx, grad_features.data)
+ #         return grad_features, torch.zeros_like(idx)
+
+
+ class GroupingOperation(Function):
+
+     @staticmethod
+     def forward(ctx, features: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
+         """
+         :param ctx:
+         :param features: (B, C, N) tensor of features to group
+         :param idx: (B, npoint, nsample) tensor containing the indices of features to group with
+         :return:
+             output: (B, C, npoint, nsample) tensor
+         """
+         if not torch.cuda.is_available():
+             raise RuntimeError("CUDA is not available, and no CPU fallback is implemented.")
+
+         features = features.contiguous()
+         idx = idx.contiguous()
+
+         B, npoint, nsample = idx.size()
+         _, C, N = features.size()
+         device = torch.device('cuda')
+         output = torch.zeros(B, C, npoint, nsample, dtype=torch.float32, device=device)
+
+         pc_util.group_points_wrapper(B, C, N, npoint, nsample, features, idx, output)
+
+         ctx.save_for_backward(idx, features)
+         return output
+
+     @staticmethod
+     def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+         """
+         :param ctx:
+         :param grad_out: (B, C, npoint, nsample) tensor of the gradients of the output from forward
+         :return:
+             grad_features: (B, C, N) gradient of the features
+         """
+         if not torch.cuda.is_available():
+             raise RuntimeError("CUDA is not available, and no CPU fallback is implemented.")
+
+         idx, features = ctx.saved_tensors
+         B, C, N = features.size()
+
+         _, _, npoint, nsample = grad_out.size()
+         device = torch.device('cuda')
+         grad_features = torch.zeros(B, C, N, dtype=torch.float32, device=device)
+
+         grad_out_data = grad_out.contiguous()
+         pc_util.group_points_grad_wrapper(B, C, N, npoint, nsample, grad_out_data, idx, grad_features)
+         return grad_features, torch.zeros_like(idx)
+
+
+ grouping_operation = GroupingOperation.apply
+
+
+ # class BallQuery(Function):
+ #
+ #     @staticmethod
+ #     def forward(ctx, radius: float, nsample: int, xyz: torch.Tensor, new_xyz: torch.Tensor) -> torch.Tensor:
+ #         """
+ #         :param ctx:
+ #         :param radius: float, radius of the balls
+ #         :param nsample: int, maximum number of features in the balls
+ #         :param xyz: (B, N, 3) xyz coordinates of the features
+ #         :param new_xyz: (B, npoint, 3) centers of the ball query
+ #         :return:
+ #             idx: (B, npoint, nsample) tensor with the indices of the features that form the query balls
+ #         """
+ #         new_xyz = new_xyz.contiguous()
+ #         xyz = xyz.contiguous()
+ #
+ #         B, N, _ = xyz.size()
+ #         npoint = new_xyz.size(1)
+ #         idx = torch.cuda.IntTensor(B, npoint, nsample).zero_()
+ #
+ #         pc_util.ball_query_wrapper(B, N, npoint, radius, nsample, new_xyz, xyz, idx)
+ #         ctx.mark_non_differentiable(idx)
+ #         return idx
+ #
+ #     @staticmethod
+ #     def backward(ctx, grad_out):
+ #         return ()
+
+
+ class BallQuery(Function):
+
+     @staticmethod
+     def forward(ctx, radius: float, nsample: int, xyz: torch.Tensor, new_xyz: torch.Tensor) -> torch.Tensor:
+         """
+         :param ctx:
+         :param radius: float, radius of the balls
+         :param nsample: int, maximum number of features in the balls
+         :param xyz: (B, N, 3) xyz coordinates of the features
+         :param new_xyz: (B, npoint, 3) centers of the ball query
+         :return:
+             idx: (B, npoint, nsample) tensor with the indices of the features that form the query balls
+         """
+         if not torch.cuda.is_available():
+             raise RuntimeError("CUDA is not available, and no CPU fallback is implemented.")
+
+         new_xyz = new_xyz.contiguous()
+         xyz = xyz.contiguous()
+
+         B, N, _ = xyz.size()
+         npoint = new_xyz.size(1)
+         device = torch.device('cuda')
+         idx = torch.zeros(B, npoint, nsample, dtype=torch.int32, device=device)
+
+         pc_util.ball_query_wrapper(B, N, npoint, radius, nsample, new_xyz, xyz, idx)
+         ctx.mark_non_differentiable(idx)
+         return idx
+
+     @staticmethod
+     def backward(ctx, grad_out):
+         return ()
+
+
+ ball_query = BallQuery.apply
+
+
+ # class BallCenterQuery(Function):
+ #
+ #     @staticmethod
+ #     def forward(ctx, radius: float, point: torch.Tensor, key_point: torch.Tensor) -> torch.Tensor:
+ #         """
+ #         :param ctx:
+ #         :param radius: float, radius of the balls
+ #         :param point: (B, N, 3) xyz coordinates of the features
+ #         :param key_point: (B, npoint, 3) centers of the ball query
+ #         :return:
+ #             idx: (B, N) tensor with the indices of the features that form the query balls
+ #         """
+ #         point = point.contiguous()
+ #         key_point = key_point.contiguous()
+ #
+ #         B, N, _ = point.size()
+ #         npoint = key_point.size(1)
+ #         idx = torch.cuda.IntTensor(B, N).zero_() - 1
+ #
+ #         pc_util.ball_center_query_wrapper(B, N, npoint, radius, point, key_point, idx)
+ #         ctx.mark_non_differentiable(idx)
+ #         return idx
+ #
+ #     @staticmethod
+ #     def backward(ctx, grad_out):
+ #         return ()
+
+
+ class BallCenterQuery(Function):
+
+     @staticmethod
+     def forward(ctx, radius: float, point: torch.Tensor, key_point: torch.Tensor) -> torch.Tensor:
+         """
+         :param ctx:
+         :param radius: float, radius of the balls
+         :param point: (B, N, 3) xyz coordinates of the features
+         :param key_point: (B, npoint, 3) centers of the ball query
+         :return:
+             idx: (B, N) tensor with the indices of the features that form the query balls
+         """
+         if not torch.cuda.is_available():
+             raise RuntimeError("CUDA is not available, and no CPU fallback is implemented.")
+
+         point = point.contiguous()
+         key_point = key_point.contiguous()
+
+         B, N, _ = point.size()
+         npoint = key_point.size(1)
+         device = torch.device('cuda')
+         idx = torch.full((B, N), -1, dtype=torch.int32, device=device)
+
+         pc_util.ball_center_query_wrapper(B, N, npoint, radius, point, key_point, idx)
+         ctx.mark_non_differentiable(idx)
+         return idx
+
+     @staticmethod
+     def backward(ctx, grad_out):
+         return ()
+
+
+ ball_center_query = BallCenterQuery.apply
+
+
+ # class KNNQuery(Function):
+ #
+ #     @staticmethod
+ #     def forward(ctx, nsample: int, xyz: torch.Tensor, new_xyz: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ #         """
+ #         Find the k nearest neighbors of new_xyz in xyz
+ #         :param ctx:
+ #         :param nsample: int, number of features in knn
+ #         :param xyz: (B, N, 3)
+ #         :param new_xyz: (B, npoint, 3)
+ #         :return:
+ #             dist: (B, npoint, nsample) l2 distance to knn
+ #             idx: (B, npoint, nsample) index of knn
+ #         """
+ #         new_xyz = new_xyz.contiguous()
+ #         xyz = xyz.contiguous()
+ #
+ #         B, N, _ = xyz.size()
+ #         npoint = new_xyz.size(1)
+ #         dist2 = torch.cuda.FloatTensor(np.ones([B, npoint, nsample]) * 1e4)
+ #         idx = torch.cuda.IntTensor(B, npoint, nsample)
+ #
+ #         pc_util.knn_query_wrapper(B, N, npoint, nsample, new_xyz, xyz, dist2, idx)
+ #         ctx.mark_non_differentiable(dist2, idx)
+ #         return torch.sqrt(dist2), idx
+ #
+ #     @staticmethod
+ #     def backward(ctx, grad_out):
+ #         return ()
+
+
+ class KNNQuery(Function):
+
+     @staticmethod
+     def forward(ctx, nsample: int, xyz: torch.Tensor, new_xyz: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+         """
+         Find the k nearest neighbors of new_xyz in xyz
+         :param ctx:
+         :param nsample: int, number of features in knn
+         :param xyz: (B, N, 3)
+         :param new_xyz: (B, npoint, 3)
+         :return:
+             dist: (B, npoint, nsample) l2 distance to knn
+             idx: (B, npoint, nsample) index of knn
+         """
+         if not torch.cuda.is_available():
+             raise RuntimeError("CUDA is not available, and no CPU fallback is implemented.")
+
+         new_xyz = new_xyz.contiguous()
+         xyz = xyz.contiguous()
+
+         B, N, _ = xyz.size()
+         npoint = new_xyz.size(1)
+         device = torch.device('cuda')
+         dist2 = torch.full((B, npoint, nsample), 1e4, dtype=torch.float32, device=device)
+         idx = torch.zeros((B, npoint, nsample), dtype=torch.int32, device=device)
+
+         pc_util.knn_query_wrapper(B, N, npoint, nsample, new_xyz, xyz, dist2, idx)
+         ctx.mark_non_differentiable(dist2, idx)
+         return torch.sqrt(dist2), idx
+
+     @staticmethod
+     def backward(ctx, grad_out):
+         return ()
+
+
+ knn_query = KNNQuery.apply
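Editor's note: a pure-PyTorch reference for the ball_query semantics implemented by the CUDA kernel (each query's slot list is padded with its first hit; queries with no hits stay all-zero). This is an editorial sketch for CPU sanity checks, not part of the repository.

import torch

def ball_query_reference(radius, nsample, xyz, new_xyz):
    # xyz: (B, N, 3), new_xyz: (B, M, 3) -> idx: (B, M, nsample)
    d2 = torch.cdist(new_xyz, xyz).pow(2)              # (B, M, N) squared distances
    B, M, N = d2.shape
    idx = torch.zeros(B, M, nsample, dtype=torch.long)
    for b in range(B):
        for m in range(M):
            hits = torch.nonzero(d2[b, m] < radius ** 2).flatten()
            if hits.numel() == 0:
                continue                               # no neighbors: stays all-zero, as in the kernel
            take = hits[:nsample]
            idx[b, m] = take[0]                        # pad every slot with the first hit...
            idx[b, m, :take.numel()] = take            # ...then overwrite the prefix with real hits
    return idx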
model/roofnet.py ADDED
@@ -0,0 +1,35 @@
+ from .pointnet2 import PointNet2
+ from .cluster_refine import ClusterRefineNet
+ from .edge_pred_net import EdgeAttentionNet
+ import torch.nn as nn
+ from sklearn.cluster import DBSCAN
+
+
+ class RoofNet(nn.Module):
+     def __init__(self, model_cfg, input_channel=3):
+         super().__init__()
+         self.use_edge = False
+         self.model_cfg = model_cfg
+         self.keypoint_det_net = PointNet2(model_cfg.PointNet2, input_channel)
+         self.cluster_refine_net = ClusterRefineNet(model_cfg.ClusterRefineNet, input_channel=self.keypoint_det_net.num_output_feature)
+         self.edge_att_net = EdgeAttentionNet(model_cfg.EdgeAttentionNet, input_channel=self.cluster_refine_net.num_output_feature)
+
+     def forward(self, batch_dict):
+         batch_dict = self.keypoint_det_net(batch_dict)
+         if self.use_edge:
+             batch_dict = self.cluster_refine_net(batch_dict)
+             batch_dict = self.edge_att_net(batch_dict)
+         if self.training:
+             loss = 0
+             loss_dict = {}
+             disp_dict = {}
+             tmp_loss, loss_dict, disp_dict = self.keypoint_det_net.loss(loss_dict, disp_dict)
+             loss += tmp_loss
+             if self.use_edge:
+                 tmp_loss, loss_dict, disp_dict = self.cluster_refine_net.loss(loss_dict, disp_dict)
+                 loss += tmp_loss
+                 tmp_loss, loss_dict, disp_dict = self.edge_att_net.loss(loss_dict, disp_dict)
+                 loss += tmp_loss
+             return loss, loss_dict, disp_dict
+         else:
+             return batch_dict
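Editor's note: a schematic of how this gated two-stage forward is typically driven. `cfg` stands for the config object from the loading sketch after model_cfg.yaml below, and `batch_dict` for a batch from this repo's dataset pipeline; neither is defined here, so this is an outline under those assumptions, not a standalone script.

net = RoofNet(cfg.MODEL)
net.use_edge = True    # the refine + edge stages are gated off by default
net.train()
loss, loss_dict, disp_dict = net(batch_dict)
loss.backward()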
model_cfg.yaml ADDED
@@ -0,0 +1,26 @@
+ DATA:
+   NPOINT: 1024
+ MODEL:
+   PointNet2:
+     PosRadius: 0.15
+     LossWeight: {
+       'cls_weight': 1.0,
+       'reg_weight': 1.0
+     }
+   ClusterRefineNet:
+     ScoreThresh: 0.5
+     MatchRadius: 0.2
+     Cluster:
+       eps: 0.05
+       min_pts: 5
+     RefineSA:
+       Radii: [0.1, 0.2]
+       Nsamples: [16, 16]
+       MLPs: [[128, 128], [128, 128]]
+     LossWeight: {
+       'reg_weight': 1.0
+     }
+   EdgeAttentionNet:
+     LossWeight: {
+       'cls_weight': 1.0
+     }
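Editor's note: one way to turn model_cfg.yaml into the attribute-style config the model classes index into (e.g. model_cfg.PointNet2.PosRadius). EasyDict is an assumption, not pinned by this diff; any dot-accessible mapping would do.

import yaml
from easydict import EasyDict  # assumed dependency

with open('model_cfg.yaml', 'r') as f:
    cfg = EasyDict(yaml.safe_load(f))

assert cfg.MODEL.PointNet2.PosRadius == 0.15
assert cfg.MODEL.ClusterRefineNet.Cluster.eps == 0.05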
output/hoho_test/checkpoint_epoch_90_all.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:beadacd967ce200405fafbb9a3434191606dbda4ee4ab70ddf7f861064861cf7
+ size 16980109
output/hoho_test/test/log.txt ADDED
The diff for this file is too large to render. See raw diff
 
output/hoho_test/test/submission.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:199824086b9925e081aa30308e43ab1e5c7c269907197516d67582ab75801008
+ size 54126
output/hoho_train/ckpt/checkpoint_epoch_41.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3f4a7fbbbad3fb17933ecab37a19e7555d6134445d092779c0934a7128e9ab8b
+ size 17019805
output/hoho_train/ckpt/checkpoint_epoch_42.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:220030fbbe46caaddc307c32f26e8a9096ef128b0b9b68701a05acaa3b5e6520
+ size 17019805
output/hoho_train/ckpt/checkpoint_epoch_43.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:61afd8b6e889507930257152cabe814d6f938c76f199fac8cf3e2ad893364abe
+ size 17019805
output/hoho_train/ckpt/checkpoint_epoch_44.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f674dfe549c1ce1988af4f91397a14cca5004178933d7902ae71316fa494bf12
+ size 17019805
output/hoho_train/ckpt/checkpoint_epoch_45.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8846e86eff7717407c72704e9b4757c429ca8103cf800c2bfb10c27c58abab39
+ size 17019805
output/hoho_train/log.txt ADDED
@@ -0,0 +1,9 @@
+ 2024-05-28 18:15:32,724 INFO **********************Start logging**********************
+ 2024-05-28 18:15:32,725 INFO Total samples: 4328
+ 2024-05-28 18:15:33,700 INFO **********************Start training**********************
+ 2024-05-29 16:17:23,568 INFO **********************Start logging**********************
+ 2024-05-29 16:17:23,578 INFO Total samples: 4328
+ 2024-05-29 16:17:24,693 INFO ==> Loading parameters from checkpoint
+ 2024-05-29 16:17:24,732 INFO ==> Loading optimizer parameters from checkpoint
+ 2024-05-29 16:17:24,740 INFO ==> Done
+ 2024-05-29 16:17:24,740 INFO **********************Start training**********************
pc_util/setup.py ADDED
@@ -0,0 +1,23 @@
+ from setuptools import setup
+ from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+ setup(
+     name='pc_util',
+     version='1.0',
+     ext_modules=[
+         CUDAExtension('pc_util', [
+             'src/pointnet2_api.cpp',
+             'src/ball_query.cpp',
+             'src/ball_query_gpu.cu',
+             'src/group_points.cpp',
+             'src/group_points_gpu.cu',
+             'src/interpolate.cpp',
+             'src/interpolate_gpu.cu',
+             'src/sampling.cpp',
+             'src/sampling_gpu.cu',
+             'src/cluster.cpp',
+             'src/cluster_gpu.cu',
+         ], extra_compile_args={'cxx': ['-g'], 'nvcc': ['-O2']})
+     ],
+     cmdclass={'build_ext': BuildExtension}
+ )
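Editor's note: the extension is built in place (e.g. `python setup.py develop` or `pip install -e .` inside pc_util/) before the Python wrappers can import it. A quick post-build sanity check, using only wrapper names that the Python modules in this commit actually call:

import pc_util

for name in ('ball_query_wrapper', 'furthest_point_sampling_wrapper',
             'three_nn_wrapper', 'three_interpolate_wrapper',
             'group_points_wrapper', 'knn_query_wrapper',
             'ball_query_wrapper_stack', 'group_points_wrapper_stack'):
    assert hasattr(pc_util, name), name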
pc_util/src/ball_query.cpp ADDED
@@ -0,0 +1,84 @@
+ #include <torch/serialize/tensor.h>
+ #include <vector>
+ // #include <THC/THC.h>
+ #include <cuda.h>
+ #include <cuda_runtime_api.h>
+ #include "ball_query_gpu.h"
+
+ // extern THCState *state;
+
+ #include <ATen/cuda/CUDAContext.h>
+ #include <ATen/cuda/CUDAEvent.h>
+ // cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ #define CHECK_CUDA(x) do { \
+     if (!x.type().is_cuda()) { \
+         fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
+         exit(-1); \
+     } \
+ } while (0)
+ #define CHECK_CONTIGUOUS(x) do { \
+     if (!x.is_contiguous()) { \
+         fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
+         exit(-1); \
+     } \
+ } while (0)
+ #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
+
+ int ball_query_wrapper_fast(int b, int n, int m, float radius, int nsample,
+         at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor) {
+     CHECK_INPUT(new_xyz_tensor);
+     CHECK_INPUT(xyz_tensor);
+     const float *new_xyz = new_xyz_tensor.data<float>();
+     const float *xyz = xyz_tensor.data<float>();
+     int *idx = idx_tensor.data<int>();
+
+     ball_query_kernel_launcher_fast(b, n, m, radius, nsample, new_xyz, xyz, idx);
+     return 1;
+ }
+
+
+ int ball_center_query_wrapper_fast(int b, int n, int m, float radius,
+         at::Tensor point_tensor, at::Tensor key_point_tensor, at::Tensor idx_tensor) {
+     CHECK_INPUT(point_tensor);
+     CHECK_INPUT(key_point_tensor);
+     const float *point = point_tensor.data<float>();
+     const float *key_point = key_point_tensor.data<float>();
+     int *idx = idx_tensor.data<int>();
+
+     ball_center_query_kernel_launcher_fast(b, n, m, radius, point, key_point, idx);
+     return 1;
+ }
+
+
+ int knn_query_wrapper_fast(int b, int n, int m, int nsample,
+         at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) {
+     CHECK_INPUT(new_xyz_tensor);
+     CHECK_INPUT(xyz_tensor);
+     const float *new_xyz = new_xyz_tensor.data<float>();
+     const float *xyz = xyz_tensor.data<float>();
+     float *dist2 = dist2_tensor.data<float>();
+     int *idx = idx_tensor.data<int>();
+
+     knn_query_kernel_launcher_fast(b, n, m, nsample, new_xyz, xyz, dist2, idx);
+     return 1;
+ }
+
+
+ int ball_query_wrapper_stack(int B, int M, float radius, int nsample,
+         at::Tensor new_xyz_tensor, at::Tensor new_xyz_batch_cnt_tensor,
+         at::Tensor xyz_tensor, at::Tensor xyz_batch_cnt_tensor, at::Tensor idx_tensor) {
+     CHECK_INPUT(new_xyz_tensor);
+     CHECK_INPUT(xyz_tensor);
+     CHECK_INPUT(new_xyz_batch_cnt_tensor);
+     CHECK_INPUT(xyz_batch_cnt_tensor);
+
+     const float *new_xyz = new_xyz_tensor.data<float>();
+     const float *xyz = xyz_tensor.data<float>();
+     const int *new_xyz_batch_cnt = new_xyz_batch_cnt_tensor.data<int>();
+     const int *xyz_batch_cnt = xyz_batch_cnt_tensor.data<int>();
+     int *idx = idx_tensor.data<int>();
+
+     ball_query_kernel_launcher_stack(B, M, radius, nsample, new_xyz, new_xyz_batch_cnt, xyz, xyz_batch_cnt, idx);
+     return 1;
+ }
pc_util/src/ball_query_gpu.cu ADDED
@@ -0,0 +1,270 @@
+ #include <math.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+
+ #include "ball_query_gpu.h"
+ #include "cuda_utils.h"
+
+
+ __global__ void ball_query_kernel_fast(int b, int n, int m, float radius, int nsample,
+         const float *__restrict__ new_xyz, const float *__restrict__ xyz, int *__restrict__ idx) {
+     // new_xyz: (B, M, 3)
+     // xyz: (B, N, 3)
+     // output:
+     //     idx: (B, M, nsample)
+     int bs_idx = blockIdx.y;
+     int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+     if (bs_idx >= b || pt_idx >= m) return;
+
+     new_xyz += bs_idx * m * 3 + pt_idx * 3;
+     xyz += bs_idx * n * 3;
+     idx += bs_idx * m * nsample + pt_idx * nsample;
+
+     float radius2 = radius * radius;
+     float new_x = new_xyz[0];
+     float new_y = new_xyz[1];
+     float new_z = new_xyz[2];
+
+     int cnt = 0;
+     for (int k = 0; k < n; ++k) {
+         float x = xyz[k * 3 + 0];
+         float y = xyz[k * 3 + 1];
+         float z = xyz[k * 3 + 2];
+         float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
+         if (d2 < radius2) {
+             if (cnt == 0) {
+                 // pad every slot with the first hit so unfilled tail entries stay valid
+                 for (int l = 0; l < nsample; ++l) {
+                     idx[l] = k;
+                 }
+             }
+             idx[cnt] = k;
+             ++cnt;
+             if (cnt >= nsample) break;
+         }
+     }
+ }
+
+
+ void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsample, \
+         const float *new_xyz, const float *xyz, int *idx) {
+     // new_xyz: (B, M, 3)
+     // xyz: (B, N, 3)
+     // output:
+     //     idx: (B, M, nsample)
+
+     cudaError_t err;
+
+     dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b);  // blockIdx.x(col), blockIdx.y(row)
+     dim3 threads(THREADS_PER_BLOCK);
+
+     ball_query_kernel_fast<<<blocks, threads>>>(b, n, m, radius, nsample, new_xyz, xyz, idx);
+     // cudaDeviceSynchronize();  // for using printf in kernel function
+     err = cudaGetLastError();
+     if (cudaSuccess != err) {
+         fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+         exit(-1);
+     }
+ }
+
+
+ __global__ void ball_center_query_kernel_fast(int b, int n, int m, float radius, \
+         const float *__restrict__ point, const float *__restrict__ key_point, int *__restrict__ idx) {
+     // key_point: (B, M, 3)
+     // point: (B, N, 3)
+     // output:
+     //     idx: (B, N)
+     int bs_idx = blockIdx.y;
+     int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+     if (bs_idx >= b || pt_idx >= n) return;
+
+     point += bs_idx * n * 3 + pt_idx * 3;
+     key_point += bs_idx * m * 3;
+     idx += bs_idx * n + pt_idx;
+
+     float radius2 = radius * radius;
+     float point_x = point[0];
+     float point_y = point[1];
+     float point_z = point[2];
+
+     float bestd = 1e8;
+     for (int k = 0; k < m; ++k) {
+         float x = key_point[k * 3 + 0];
+         float y = key_point[k * 3 + 1];
+         float z = key_point[k * 3 + 2];
+         // stop at the (-1, -1, -1) padding sentinel that marks the end of valid key points
+         if (((x + 1) * (x + 1) + (y + 1) * (y + 1) + (z + 1) * (z + 1)) < 1e-4) break;
+         float d2 = (point_x - x) * (point_x - x) + (point_y - y) * (point_y - y) + (point_z - z) * (point_z - z);
+         if (d2 < radius2 && d2 < bestd) {
+             idx[0] = k;
+             bestd = d2;
+         }
+     }
+ }
+
+
+ void ball_center_query_kernel_launcher_fast(int b, int n, int m, float radius, \
+         const float *point, const float *key_point, int *idx) {
+     // point: (B, n, 3)
+     // key_point: (B, m, 3)
+     // output:
+     //     idx: (B, n)
+
+     cudaError_t err;
+
+     dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b);  // blockIdx.x(col), blockIdx.y(row)
+     dim3 threads(THREADS_PER_BLOCK);
+
+     ball_center_query_kernel_fast<<<blocks, threads>>>(b, n, m, radius, point, key_point, idx);
+     // cudaDeviceSynchronize();  // for using printf in kernel function
+     err = cudaGetLastError();
+     if (cudaSuccess != err) {
+         fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+         exit(-1);
+     }
+ }
+
+
+ __global__ void knn_query_kernel_fast(int b, int n, int m, int nsample, const float *__restrict__ new_xyz,
+         const float *__restrict__ xyz, float *__restrict__ dist2, int *__restrict__ idx) {
+     // new_xyz: (B, M, 3)
+     // xyz: (B, N, 3)
+     // output:
+     //     dist2: (B, M, nsample)  (must be pre-filled with a large value on the host)
+     //     idx: (B, M, nsample)
+     int bs_idx = blockIdx.y;
+     int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+     if (bs_idx >= b || pt_idx >= m) return;
+
+     new_xyz += bs_idx * m * 3 + pt_idx * 3;
+     xyz += bs_idx * n * 3;
+     dist2 += bs_idx * m * nsample + pt_idx * nsample;
+     idx += bs_idx * m * nsample + pt_idx * nsample;
+
+     float nx = new_xyz[0];
+     float ny = new_xyz[1];
+     float nz = new_xyz[2];
+
+     for (int i = 0; i < n; ++i) {
+         float x = xyz[i * 3 + 0];
+         float y = xyz[i * 3 + 1];
+         float z = xyz[i * 3 + 2];
+         float d2 = (nx - x) * (nx - x) + (ny - y) * (ny - y) + (nz - z) * (nz - z);
+         if (d2 < dist2[nsample - 1]) {
+             // insertion sort: bubble the new candidate into its sorted position
+             dist2[nsample - 1] = d2;
+             idx[nsample - 1] = i;
+             for (int j = nsample - 2; j >= 0; j--) {
+                 if (d2 < dist2[j]) {
+                     dist2[j + 1] = dist2[j];
+                     dist2[j] = d2;
+                     idx[j + 1] = idx[j];
+                     idx[j] = i;
+                 }
+             }
+         }
+     }
+ }
+
+
+ void knn_query_kernel_launcher_fast(int b, int n, int m, int nsample, \
+         const float *new_xyz, const float *xyz, float *dist2, int *idx) {
+     cudaError_t err;
+
+     dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b);  // blockIdx.x(col), blockIdx.y(row)
+     dim3 threads(THREADS_PER_BLOCK);
+
+     knn_query_kernel_fast<<<blocks, threads>>>(b, n, m, nsample, new_xyz, xyz, dist2, idx);
+     // cudaDeviceSynchronize();  // for using printf in kernel function
+     err = cudaGetLastError();
+     if (cudaSuccess != err) {
+         fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+         exit(-1);
+     }
+ }
+
+
+ __global__ void ball_query_kernel_stack(int B, int M, float radius, int nsample, \
+         const float *new_xyz, const int *new_xyz_batch_cnt, const float *xyz, const int *xyz_batch_cnt, int *idx) {
+     // :param xyz: (N1 + N2 ..., 3) xyz coordinates of the features
+     // :param xyz_batch_cnt: (batch_size), [N1, N2, ...]
+     // :param new_xyz: (M1 + M2 ..., 3) centers of the ball query
+     // :param new_xyz_batch_cnt: (batch_size), [M1, M2, ...]
+     // output:
+     //     idx: (M, nsample)
+     int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+     if (pt_idx >= M) return;
+
+     // locate which batch element this stacked query point belongs to
+     int bs_idx = 0, pt_cnt = new_xyz_batch_cnt[0];
+     for (int k = 1; k < B; k++) {
+         if (pt_idx < pt_cnt) break;
+         pt_cnt += new_xyz_batch_cnt[k];
+         bs_idx = k;
+     }
+
+     int xyz_batch_start_idx = 0;
+     for (int k = 0; k < bs_idx; k++) xyz_batch_start_idx += xyz_batch_cnt[k];
+     // for (int k = 0; k < bs_idx; k++) new_xyz_batch_start_idx += new_xyz_batch_cnt[k];
+
+     new_xyz += pt_idx * 3;
+     xyz += xyz_batch_start_idx * 3;
+     idx += pt_idx * nsample;
+
+     float radius2 = radius * radius;
+     float new_x = new_xyz[0];
+     float new_y = new_xyz[1];
+     float new_z = new_xyz[2];
+     int n = xyz_batch_cnt[bs_idx];
+
+     int cnt = 0;
+     for (int k = 0; k < n; ++k) {
+         float x = xyz[k * 3 + 0];
+         float y = xyz[k * 3 + 1];
+         float z = xyz[k * 3 + 2];
+         float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
+         if (d2 < radius2) {
+             if (cnt == 0) {
+                 for (int l = 0; l < nsample; ++l) {
+                     idx[l] = k;
+                 }
+             }
+             idx[cnt] = k;
+             ++cnt;
+             if (cnt >= nsample) break;
+         }
+     }
+     // empty-ball sentinel consumed by the Python wrapper (empty_ball_mask)
+     if (cnt == 0) idx[0] = -1;
+ }
+
+
+ void ball_query_kernel_launcher_stack(int B, int M, float radius, int nsample,
+         const float *new_xyz, const int *new_xyz_batch_cnt, const float *xyz, const int *xyz_batch_cnt, int *idx) {
+     // :param xyz: (N1 + N2 ..., 3) xyz coordinates of the features
+     // :param xyz_batch_cnt: (batch_size), [N1, N2, ...]
+     // :param new_xyz: (M1 + M2 ..., 3) centers of the ball query
+     // :param new_xyz_batch_cnt: (batch_size), [M1, M2, ...]
+     // output:
+     //     idx: (M, nsample)
+
+     cudaError_t err;
+
+     dim3 blocks(DIVUP(M, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
+     dim3 threads(THREADS_PER_BLOCK);
+
+     ball_query_kernel_stack<<<blocks, threads>>>(B, M, radius, nsample, new_xyz, new_xyz_batch_cnt, xyz, xyz_batch_cnt, idx);
+     // cudaDeviceSynchronize();  // for using printf in kernel function
+     err = cudaGetLastError();
+     if (cudaSuccess != err) {
+         fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+         exit(-1);
+     }
+ }
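Editor's note: every launcher above sizes its grid as DIVUP(work_items, THREADS_PER_BLOCK) blocks of 256 threads, one thread per query point. DIVUP itself is defined in cuda_utils.h (truncated in this view); the conventional ceil-division it implements is mirrored here for reference:

def divup(m, t):
    # Python mirror of the DIVUP(m, t) grid-sizing macro (assumed ceil division)
    return (m + t - 1) // t

assert divup(1000, 256) == 4   # 4 blocks of 256 threads cover 1000 query points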
pc_util/src/ball_query_gpu.h ADDED
@@ -0,0 +1,38 @@
+ #ifndef _BALL_QUERY_GPU_H
+ #define _BALL_QUERY_GPU_H
+
+ #include <torch/serialize/tensor.h>
+ #include <vector>
+ #include <cuda.h>
+ #include <cuda_runtime_api.h>
+
+ int ball_query_wrapper_fast(int b, int n, int m, float radius, int nsample,
+     at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor);
+
+ void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsample,
+     const float *new_xyz, const float *xyz, int *idx);
+
+ int ball_center_query_wrapper_fast(int b, int n, int m, float radius,
+     at::Tensor point_tensor, at::Tensor key_point_tensor, at::Tensor idx_tensor);
+
+ void ball_center_query_kernel_launcher_fast(int b, int n, int m, float radius,
+     const float *point, const float *key_point, int *idx);
+
+ int knn_query_wrapper_fast(int b, int n, int m, int nsample,
+     at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
+
+ void knn_query_kernel_launcher_fast(int b, int n, int m, int nsample,
+     const float *new_xyz, const float *xyz, float *dist2, int *idx);
+
+
+ int ball_query_wrapper_stack(int B, int M, float radius, int nsample,
+     at::Tensor new_xyz_tensor, at::Tensor new_xyz_batch_cnt_tensor,
+     at::Tensor xyz_tensor, at::Tensor xyz_batch_cnt_tensor, at::Tensor idx_tensor);
+
+
+ void ball_query_kernel_launcher_stack(int B, int M, float radius, int nsample,
+     const float *new_xyz, const int *new_xyz_batch_cnt, const float *xyz, const int *xyz_batch_cnt, int *idx);
+
+
+ #endif
pc_util/src/cluster.cpp ADDED
@@ -0,0 +1,50 @@
+ #include <torch/serialize/tensor.h>
+ #include <vector>
+ // #include <THC/THC.h>
+ #include <cuda.h>
+ #include <cuda_runtime_api.h>
+ #include "cluster_gpu.h"
+
+ // extern THCState *state;
+
+ #include <ATen/cuda/CUDAContext.h>
+ #include <ATen/cuda/CUDAEvent.h>
+ // cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ #define CHECK_CUDA(x) do { \
+     if (!x.type().is_cuda()) { \
+         fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
+         exit(-1); \
+     } \
+ } while (0)
+ #define CHECK_CONTIGUOUS(x) do { \
+     if (!x.is_contiguous()) { \
+         fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
+         exit(-1); \
+     } \
+ } while (0)
+ #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
+
+ int dbscan_wrapper_fast(int b, int n, float eps, int min_pts, at::Tensor xyz_tensor, at::Tensor idx_tensor) {
+     CHECK_INPUT(xyz_tensor);
+     const float *xyz = xyz_tensor.data<float>();
+     int *idx = idx_tensor.data<int>();
+
+     dbscan_kernel_launcher_fast(b, n, eps, min_pts, xyz, idx);
+     return 1;
+ }
+
+
+ int cluster_pts_wrapper_fast(int b, int n, int m, at::Tensor xyz_tensor, at::Tensor idx_tensor,
+         at::Tensor new_xyz_tensor, at::Tensor num_tensor) {
+     CHECK_INPUT(xyz_tensor);
+     CHECK_INPUT(idx_tensor);
+     const float *xyz = xyz_tensor.data<float>();
+     const int *idx = idx_tensor.data<int>();
+     float *new_xyz = new_xyz_tensor.data<float>();
+     int *num = num_tensor.data<int>();
+
+     cluster_pts_kernel_launcher_fast(b, n, m, xyz, idx, new_xyz, num);
+     return 1;
+ }
pc_util/src/cluster_gpu.cu ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <math.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+
5
+
6
+ #include "cluster_gpu.h"
7
+ #include "cuda_utils.h"
8
+
9
+
10
+ __device__ float get_dis(float x1, float y1, float z1, float x2, float y2, float z2) {
+     float dis = (x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2) + (z1 - z2) * (z1 - z2);
+     return sqrt(dis);
+ }
+
+ /*
+ // Recursive cluster expansion, kept for reference; replaced by the
+ // iterative stack-based expansion in dbscan_kernel_fast below.
+ __device__ void dfs(int i, int c, int n, int min_pts, const int* pts_cnt, const int* pts_adj, int* idx, int label) {
+     idx[i] = c;
+     if (pts_cnt[i] < min_pts) return;
+
+     for (int j = 0; j < n; j++) {
+         int adj = pts_adj[i * n + j];
+         printf("%d %d %d\n", i * n, i * n + j, adj);
+         if (adj == -1) break;
+         if (idx[adj] == -1)
+             dfs(adj, c, n, min_pts, pts_cnt, pts_adj, idx, label);
+     }
+ }
+ */
+
+ __global__ void dbscan_kernel_fast(int b, int n, float eps, int min_pts, const float *__restrict__ xyz, int *__restrict__ idx,
+     int *__restrict__ pts_cnt, int *__restrict__ pts_adj, int *__restrict__ pts_stack) {
+     // xyz: (B, N, 3)
+     // output:
+     //      idx: (B, N), cluster id per point, -1 for noise (must be initialized to -1)
+     int bs_idx = blockIdx.x * blockDim.x + threadIdx.x;
+     if (bs_idx >= b) return;
+
+     xyz += bs_idx * n * 3;
+     idx += bs_idx * n;
+     pts_cnt += bs_idx * n;
+     pts_stack += bs_idx * n;
+     pts_adj += bs_idx * n * n;
+
+     // Build the adjacency lists: for each point, record every eps-neighbor
+     // and the neighbor count.
+     for (int i = 0; i < n; i++) {
+         pts_cnt[i] = 0;
+         for (int j = 0; j < n; j++) {
+             pts_adj[i * n + j] = -1;
+             if (i == j) continue;
+             float x1 = xyz[i * 3 + 0];
+             float y1 = xyz[i * 3 + 1];
+             float z1 = xyz[i * 3 + 2];
+             float x2 = xyz[j * 3 + 0];
+             float y2 = xyz[j * 3 + 1];
+             float z2 = xyz[j * 3 + 2];
+
+             // Skip padding points parked at the sentinel position (-10, -10, -10).
+             if (get_dis(x2, y2, z2, -10.0, -10.0, -10.0) < 1e-3) continue;
+             if (get_dis(x1, y1, z1, x2, y2, z2) <= eps) {
+                 pts_adj[i * n + pts_cnt[i]] = j;
+                 pts_cnt[i] += 1;
+             }
+         }
+     }
+
+     int cluster_idx = 0;
+
+     // Grow a cluster from each unvisited core point using an explicit stack.
+     for (int i = 0; i < n; i++) {
+         if (idx[i] != -1) continue;
+
+         if (pts_cnt[i] >= min_pts) {
+             for (int j = 0; j < n; j++)
+                 pts_stack[j] = -1;
+             pts_stack[0] = i;
+             int stack_idx = 0;
+             int stack_len = 1;
+             while (stack_idx < n && pts_stack[stack_idx] != -1) {
+                 int pts_idx = pts_stack[stack_idx];
+                 idx[pts_idx] = cluster_idx;
+                 if (pts_cnt[pts_idx] < min_pts) {  // border point: label it but do not expand
+                     stack_idx += 1;
+                     continue;
+                 }
+                 for (int j = 0; j < n; j++) {
+                     int adj = pts_adj[pts_idx * n + j];
+                     if (adj == -1) break;
+                     if (idx[adj] == -1) {
+                         idx[adj] = -2;  // mark as queued so each point is pushed at most once
+                         pts_stack[stack_len++] = adj;
+                     }
+                 }
+                 stack_idx += 1;
+             }
+             cluster_idx += 1;
+         }
+     }
+ }
+
+
+ void dbscan_kernel_launcher_fast(int b, int n, float eps, int min_pts, const float *xyz, int *idx) {
+     // xyz: (B, N, 3)
+     // output:
+     //      idx: (B, N)
+
+     cudaError_t err;
+
+     dim3 blocks(DIVUP(b, THREADS_PER_BLOCK));  // one thread per batch element
+     dim3 threads(THREADS_PER_BLOCK);
+
+     int* pts_cnt;
+     int* pts_stack;
+     int* pts_adj;
+
+     err = cudaMalloc((void**)&pts_cnt, b * n * sizeof(int));
+     if (cudaSuccess != err) {
+         fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+         exit(-1);
+     }
+
+     err = cudaMalloc((void**)&pts_stack, b * n * sizeof(int));
+     if (cudaSuccess != err) {
+         fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+         exit(-1);
+     }
+
+     err = cudaMalloc((void**)&pts_adj, b * n * n * sizeof(int));
+     if (cudaSuccess != err) {
+         fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+         exit(-1);
+     }
+
+     dbscan_kernel_fast<<<blocks, threads>>>(b, n, eps, min_pts, xyz, idx, pts_cnt, pts_adj, pts_stack);
+     // cudaDeviceSynchronize();  // for using printf in kernel function
+     cudaFree(pts_cnt);
+     cudaFree(pts_stack);
+     cudaFree(pts_adj);
+     err = cudaGetLastError();
+     if (cudaSuccess != err) {
+         fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+         exit(-1);
+     }
+ }
+
+
+ __global__ void cluster_pts_kernel_fast(int b, int n, int m, const float *__restrict__ xyz, const int *__restrict__ idx,
+     float *__restrict__ new_xyz, int *__restrict__ num) {
+     int bs_idx = blockIdx.x * blockDim.x + threadIdx.x;
+     if (bs_idx >= b) return;
+
+     xyz += bs_idx * n * 3;
+     idx += bs_idx * n;
+     new_xyz += bs_idx * m * 3;
+     num += bs_idx * m;
+
+     // Accumulate the coordinates of each cluster's members, then divide by
+     // the member count to obtain the cluster centroids.
+     for (int i = 0; i < n; i++) {
+         if (idx[i] == -1) continue;
+         int c_idx = idx[i];
+         new_xyz[c_idx * 3 + 0] += xyz[i * 3 + 0];
+         new_xyz[c_idx * 3 + 1] += xyz[i * 3 + 1];
+         new_xyz[c_idx * 3 + 2] += xyz[i * 3 + 2];
+         num[c_idx] += 1;
+     }
+     for (int i = 0; i < m; i++) {
+         if (num[i] == 0) break;  // cluster ids are contiguous, so stop at the first empty slot
+         new_xyz[i * 3 + 0] /= num[i];
+         new_xyz[i * 3 + 1] /= num[i];
+         new_xyz[i * 3 + 2] /= num[i];
+     }
+ }
+
+
+ void cluster_pts_kernel_launcher_fast(int b, int n, int m, const float *xyz, const int *idx, float *new_xyz, int *num) {
+     cudaError_t err;
+
+     dim3 blocks(DIVUP(b, THREADS_PER_BLOCK));  // one thread per batch element
+     dim3 threads(THREADS_PER_BLOCK);
+
+     cluster_pts_kernel_fast<<<blocks, threads>>>(b, n, m, xyz, idx, new_xyz, num);
+     // cudaDeviceSynchronize();  // for using printf in kernel function
+     err = cudaGetLastError();
+     if (cudaSuccess != err) {
+         fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+         exit(-1);
+     }
+ }
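Taken together, dbscan_kernel_fast labels every point with a cluster id (-1 for noise) and cluster_pts_kernel_fast averages each cluster's members into a centroid. A minimal sketch of how the Python side might drive the two wrappers bound in pointnet2_api.cpp below; the module name pc_util and the eps/min_pts values are assumptions, only the wrapper names and tensor shapes come from this file:

# Hypothetical usage sketch; assumes the extension built by pc_util/setup.py
# is importable as `pc_util`. Wrapper names match pointnet2_api.cpp.
import torch
import pc_util  # assumed module name

B, N, M = 2, 1024, 64                     # batch, points per cloud, max clusters
xyz = torch.rand(B, N, 3).cuda().contiguous()

# The kernel treats -1 as "unvisited", so idx must start filled with -1.
idx = torch.full((B, N), -1, dtype=torch.int32, device=xyz.device)
pc_util.dbscan_wrapper(B, N, 0.05, 5, xyz, idx)   # eps=0.05, min_pts=5 (assumed values)

# Centroids are accumulated in place, so both outputs must start at zero.
new_xyz = torch.zeros(B, M, 3, device=xyz.device)
num = torch.zeros(B, M, dtype=torch.int32, device=xyz.device)
pc_util.cluster_pts_wrapper(B, N, M, xyz, idx, new_xyz, num)
# new_xyz[b, c] now holds the centroid of cluster c; num[b, c] its size.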
pc_util/src/cluster_gpu.h ADDED
@@ -0,0 +1,34 @@
+ #ifndef _CLUSTER_GPU_H
+ #define _CLUSTER_GPU_H
+
+ #include <torch/serialize/tensor.h>
+ #include <vector>
+ #include <cuda.h>
+ #include <cuda_runtime_api.h>
+
+ int dbscan_wrapper_fast(int b, int n, float eps, int min_pts, at::Tensor xyz_tensor, at::Tensor idx_tensor);
+
+ void dbscan_kernel_launcher_fast(int b, int n, float eps, int min_pts, const float *xyz, int *idx);
+
+ int cluster_pts_wrapper_fast(int b, int n, int m, at::Tensor xyz_tensor, at::Tensor idx_tensor,
+     at::Tensor new_xyz_tensor, at::Tensor num_tensor);
+
+ void cluster_pts_kernel_launcher_fast(int b, int n, int m, const float *xyz, const int *idx, float *new_xyz, int *num);
+
+
+ int dbscan_wrapper_stack(int b, int n, float eps, int min_pts, at::Tensor xyz_tensor, at::Tensor xyz_batch_cnt_tensor,
+     at::Tensor idx_tensor);
+
+ void dbscan_kernel_launcher_stack(int b, int n, float eps, int min_pts,
+     const float *xyz, const int *xyz_batch_cnt, int *idx);
+
+ int cluster_pts_wrapper_stack(int B, at::Tensor xyz_tensor, at::Tensor xyz_batch_cnt_tensor, at::Tensor idx_tensor,
+     at::Tensor new_xyz_tensor, at::Tensor cluster_cnt_tensor);
+
+ void cluster_pts_kernel_launcher_stack(int B, const float *xyz, const int *xyz_batch_cnt, int *idx,
+     const float *new_xyz, const int *cluster_cnt);
+
+ #endif
pc_util/src/cuda_utils.h ADDED
@@ -0,0 +1,15 @@
+ #ifndef _CUDA_UTILS_H
+ #define _CUDA_UTILS_H
+
+ #include <cmath>
+
+ #define TOTAL_THREADS 1024
+ #define THREADS_PER_BLOCK 256
+ #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))  // ceiling division, used to size launch grids
+
+ // Largest power of two <= work_size, clamped to [1, TOTAL_THREADS].
+ inline int opt_n_threads(int work_size) {
+     const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
+
+     return max(min(1 << pow_2, TOTAL_THREADS), 1);
+ }
+ #endif
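The two helpers above govern every launch configuration in these files: DIVUP sizes the grid so that blocks * THREADS_PER_BLOCK covers the work, and opt_n_threads picks a power-of-two block size for the reduction kernels. A small Python mirror, useful for sanity-checking launch geometry (this mirror is illustrative, not code from the repo):

# Python mirror of DIVUP and opt_n_threads for checking launch geometry.
import math

TOTAL_THREADS = 1024
THREADS_PER_BLOCK = 256

def divup(m, n):                       # ceiling division, as in DIVUP
    return m // n + (1 if m % n else 0)

def opt_n_threads(work_size):          # largest power of two <= work_size, clamped
    pow_2 = int(math.log(work_size) / math.log(2.0))
    return max(min(1 << pow_2, TOTAL_THREADS), 1)

assert divup(1000, THREADS_PER_BLOCK) == 4   # 4 blocks x 256 threads >= 1000 items
assert opt_n_threads(300) == 256             # reduction block size for 300 points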
pc_util/src/group_points.cpp ADDED
@@ -0,0 +1,98 @@
+ #include <torch/serialize/tensor.h>
+ #include <cuda.h>
+ #include <cuda_runtime_api.h>
+ #include <vector>
+ // #include <THC/THC.h>
+ #include "group_points_gpu.h"
+
+ // extern THCState *state;
+
+ #include <ATen/cuda/CUDAContext.h>
+ #include <ATen/cuda/CUDAEvent.h>
+ // cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ #define CHECK_CUDA(x) do { \
+     if (!x.type().is_cuda()) { \
+         fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
+         exit(-1); \
+     } \
+ } while (0)
+ #define CHECK_CONTIGUOUS(x) do { \
+     if (!x.is_contiguous()) { \
+         fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
+         exit(-1); \
+     } \
+ } while (0)
+ #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
+
+
+ int group_points_grad_wrapper_fast(int b, int c, int n, int npoints, int nsample,
+     at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) {
+
+     float *grad_points = grad_points_tensor.data<float>();
+     const int *idx = idx_tensor.data<int>();
+     const float *grad_out = grad_out_tensor.data<float>();
+
+     group_points_grad_kernel_launcher_fast(b, c, n, npoints, nsample, grad_out, idx, grad_points);
+     return 1;
+ }
+
+
+ int group_points_wrapper_fast(int b, int c, int n, int npoints, int nsample,
+     at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor) {
+
+     const float *points = points_tensor.data<float>();
+     const int *idx = idx_tensor.data<int>();
+     float *out = out_tensor.data<float>();
+
+     group_points_kernel_launcher_fast(b, c, n, npoints, nsample, points, idx, out);
+     return 1;
+ }
+
+
+ int group_points_grad_wrapper_stack(int B, int M, int C, int N, int nsample,
+     at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor idx_batch_cnt_tensor,
+     at::Tensor features_batch_cnt_tensor, at::Tensor grad_features_tensor) {
+
+     CHECK_INPUT(grad_out_tensor);
+     CHECK_INPUT(idx_tensor);
+     CHECK_INPUT(idx_batch_cnt_tensor);
+     CHECK_INPUT(features_batch_cnt_tensor);
+     CHECK_INPUT(grad_features_tensor);
+
+     const float *grad_out = grad_out_tensor.data<float>();
+     const int *idx = idx_tensor.data<int>();
+     const int *idx_batch_cnt = idx_batch_cnt_tensor.data<int>();
+     const int *features_batch_cnt = features_batch_cnt_tensor.data<int>();
+     float *grad_features = grad_features_tensor.data<float>();
+
+     group_points_grad_kernel_launcher_stack(B, M, C, N, nsample, grad_out, idx, idx_batch_cnt, features_batch_cnt, grad_features);
+     return 1;
+ }
+
+
+ int group_points_wrapper_stack(int B, int M, int C, int nsample,
+     at::Tensor features_tensor, at::Tensor features_batch_cnt_tensor,
+     at::Tensor idx_tensor, at::Tensor idx_batch_cnt_tensor, at::Tensor out_tensor) {
+
+     CHECK_INPUT(features_tensor);
+     CHECK_INPUT(features_batch_cnt_tensor);
+     CHECK_INPUT(idx_tensor);
+     CHECK_INPUT(idx_batch_cnt_tensor);
+     CHECK_INPUT(out_tensor);
+
+     const float *features = features_tensor.data<float>();
+     const int *idx = idx_tensor.data<int>();
+     const int *features_batch_cnt = features_batch_cnt_tensor.data<int>();
+     const int *idx_batch_cnt = idx_batch_cnt_tensor.data<int>();
+     float *out = out_tensor.data<float>();
+
+     group_points_kernel_launcher_stack(B, M, C, nsample, features, features_batch_cnt, idx, idx_batch_cnt, out);
+     return 1;
+ }
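The *_stack wrappers above use the flattened batch layout that recurs throughout these files: point clouds of different sizes are concatenated along the first axis, and a per-batch count tensor records how many rows belong to each cloud. A small sketch of the convention in pure PyTorch (no extension call, illustrative only):

# The "stack" layout: variable-size clouds concatenated, plus count tensors.
import torch

n1, n2, C = 500, 300, 16
features = torch.cat([torch.rand(n1, C), torch.rand(n2, C)]).cuda()    # (N1 + N2, C)
features_batch_cnt = torch.tensor([n1, n2], dtype=torch.int32).cuda()  # [N1, N2]
# Inside the kernels, a thread recovers its batch element by walking the
# counts until its flat row index falls inside the running total (see the
# loops over idx_batch_cnt in group_points_gpu.cu below).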
pc_util/src/group_points_gpu.cu ADDED
@@ -0,0 +1,199 @@
+ #include <stdio.h>
+ #include <stdlib.h>
+
+ #include "cuda_utils.h"
+ #include "group_points_gpu.h"
+
+
+ __global__ void group_points_grad_kernel_fast(int b, int c, int n, int npoints, int nsample,
+     const float *__restrict__ grad_out, const int *__restrict__ idx, float *__restrict__ grad_points) {
+     // grad_out: (B, C, npoints, nsample)
+     // idx: (B, npoints, nsample)
+     // output:
+     //      grad_points: (B, C, N)
+     int bs_idx = blockIdx.z;
+     int c_idx = blockIdx.y;
+     int index = blockIdx.x * blockDim.x + threadIdx.x;
+     int pt_idx = index / nsample;
+     if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
+
+     int sample_idx = index % nsample;
+     grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+     idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+
+     // Several grouped samples may index the same input point, so the
+     // gradient must be scattered with an atomic add.
+     atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
+ }
+
+ void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
+     const float *grad_out, const int *idx, float *grad_points) {
+     // grad_out: (B, C, npoints, nsample)
+     // idx: (B, npoints, nsample)
+     // output:
+     //      grad_points: (B, C, N)
+     cudaError_t err;
+     dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);  // one thread per output element
+     dim3 threads(THREADS_PER_BLOCK);
+
+     group_points_grad_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, nsample, grad_out, idx, grad_points);
+
+     err = cudaGetLastError();
+     if (cudaSuccess != err) {
+         fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+         exit(-1);
+     }
+ }
+
+
+ __global__ void group_points_kernel_fast(int b, int c, int n, int npoints, int nsample,
+     const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
+     // points: (B, C, N)
+     // idx: (B, npoints, nsample)
+     // output:
+     //      out: (B, C, npoints, nsample)
+     int bs_idx = blockIdx.z;
+     int c_idx = blockIdx.y;
+     int index = blockIdx.x * blockDim.x + threadIdx.x;
+     int pt_idx = index / nsample;
+     if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
+
+     int sample_idx = index % nsample;
+
+     idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+     int in_idx = bs_idx * c * n + c_idx * n + idx[0];
+     int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+
+     out[out_idx] = points[in_idx];
+ }
+
+
+ void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
+     const float *points, const int *idx, float *out) {
+     // points: (B, C, N)
+     // idx: (B, npoints, nsample)
+     // output:
+     //      out: (B, C, npoints, nsample)
+     cudaError_t err;
+     dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);  // one thread per output element
+     dim3 threads(THREADS_PER_BLOCK);
+
+     group_points_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, nsample, points, idx, out);
+     // cudaDeviceSynchronize();  // for using printf in kernel function
+     err = cudaGetLastError();
+     if (cudaSuccess != err) {
+         fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+         exit(-1);
+     }
+ }
+
+
+ __global__ void group_points_grad_kernel_stack(int B, int M, int C, int N, int nsample,
+     const float *grad_out, const int *idx, const int *idx_batch_cnt, const int *features_batch_cnt, float *grad_features) {
+     // :param grad_out: (M1 + M2 ..., C, nsample) tensor of the gradients of the output from forward
+     // :param idx: (M1 + M2 ..., nsample) tensor containing the indices of features to group with
+     // :param idx_batch_cnt: (batch_size) [M1, M2, ...] number of grouped points per batch element
+     // :param features_batch_cnt: (batch_size) [N1, N2, ...] number of feature points per batch element
+     // :return:
+     //     grad_features: (N1 + N2 ..., C) gradient of the features
+     int index = blockIdx.x * blockDim.x + threadIdx.x;
+     int sample_idx = index % nsample;
+     int C_idx = (index / nsample) % C;
+     int pt_idx = (index / nsample / C);
+
+     if (pt_idx >= M || C_idx >= C || sample_idx >= nsample) return;
+
+     // Locate this point's batch element by walking the per-batch counts.
+     int bs_idx = 0, pt_cnt = idx_batch_cnt[0];
+     for (int k = 1; k < B; k++) {
+         if (pt_idx < pt_cnt) break;
+         pt_cnt += idx_batch_cnt[k];
+         bs_idx = k;
+     }
+
+     int features_batch_start_idx = 0;
+     for (int k = 0; k < bs_idx; k++) features_batch_start_idx += features_batch_cnt[k];
+
+     grad_out += pt_idx * C * nsample + C_idx * nsample + sample_idx;
+     idx += pt_idx * nsample + sample_idx;
+     grad_features += (features_batch_start_idx + idx[0]) * C + C_idx;
+
+     atomicAdd(grad_features, grad_out[0]);
+ }
+
+ void group_points_grad_kernel_launcher_stack(int B, int M, int C, int N, int nsample,
+     const float *grad_out, const int *idx, const int *idx_batch_cnt, const int *features_batch_cnt, float *grad_features) {
+     // :param grad_out: (M1 + M2 ..., C, nsample) tensor of the gradients of the output from forward
+     // :param idx: (M1 + M2 ..., nsample) tensor containing the indices of features to group with
+     // :param idx_batch_cnt: (batch_size) [M1, M2, ...] number of grouped points per batch element
+     // :param features_batch_cnt: (batch_size) [N1, N2, ...] number of feature points per batch element
+     // :return:
+     //     grad_features: (N1 + N2 ..., C) gradient of the features
+
+     cudaError_t err;
+     // dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);
+     dim3 blocks(DIVUP(M * C * nsample, THREADS_PER_BLOCK));  // one thread per output element
+     dim3 threads(THREADS_PER_BLOCK);
+
+     group_points_grad_kernel_stack<<<blocks, threads>>>(B, M, C, N, nsample, grad_out, idx, idx_batch_cnt, features_batch_cnt, grad_features);
+
+     err = cudaGetLastError();
+     if (cudaSuccess != err) {
+         fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+         exit(-1);
+     }
+ }
+
+
+ __global__ void group_points_kernel_stack(int B, int M, int C, int nsample,
+     const float *features, const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt, float *out) {
+     // :param features: (N1 + N2 ..., C) tensor of features to group
+     // :param features_batch_cnt: (batch_size) [N1, N2, ...] number of feature points per batch element
+     // :param idx: (M1 + M2 ..., nsample) tensor containing the indices of features to group with
+     // :param idx_batch_cnt: (batch_size) [M1, M2, ...] number of grouped points per batch element
+     // :return:
+     //     out: (M1 + M2 ..., C, nsample) tensor
+     int index = blockIdx.x * blockDim.x + threadIdx.x;
+     int sample_idx = index % nsample;
+     int C_idx = (index / nsample) % C;
+     int pt_idx = (index / nsample / C);
+
+     if (pt_idx >= M || C_idx >= C || sample_idx >= nsample) return;
+
+     // Locate this point's batch element by walking the per-batch counts.
+     int bs_idx = 0, pt_cnt = idx_batch_cnt[0];
+     for (int k = 1; k < B; k++) {
+         if (pt_idx < pt_cnt) break;
+         pt_cnt += idx_batch_cnt[k];
+         bs_idx = k;
+     }
+
+     int features_batch_start_idx = 0;
+     for (int k = 0; k < bs_idx; k++) features_batch_start_idx += features_batch_cnt[k];
+     features += features_batch_start_idx * C;
+
+     idx += pt_idx * nsample + sample_idx;
+     int in_idx = idx[0] * C + C_idx;
+     int out_idx = pt_idx * C * nsample + C_idx * nsample + sample_idx;
+
+     out[out_idx] = features[in_idx];
+ }
+
+
+ void group_points_kernel_launcher_stack(int B, int M, int C, int nsample,
+     const float *features, const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt, float *out) {
+     // :param features: (N1 + N2 ..., C) tensor of features to group
+     // :param features_batch_cnt: (batch_size) [N1, N2, ...] number of feature points per batch element
+     // :param idx: (M1 + M2 ..., nsample) tensor containing the indices of features to group with
+     // :param idx_batch_cnt: (batch_size) [M1, M2, ...] number of grouped points per batch element
+     // :return:
+     //     out: (M1 + M2 ..., C, nsample) tensor
+
+     cudaError_t err;
+     dim3 blocks(DIVUP(M * C * nsample, THREADS_PER_BLOCK));  // one thread per output element
+     dim3 threads(THREADS_PER_BLOCK);
+
+     group_points_kernel_stack<<<blocks, threads>>>(B, M, C, nsample, features, features_batch_cnt, idx, idx_batch_cnt, out);
+     // cudaDeviceSynchronize();  // for using printf in kernel function
+     err = cudaGetLastError();
+     if (cudaSuccess != err) {
+         fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+         exit(-1);
+     }
+ }
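The batched grouping path gathers, for every sampled center, the features of its nsample neighbors. A sketch of driving it from Python (the pc_util module name is an assumption; the shapes follow the kernel comments above):

# Sketch of the batched grouping path; `pc_util` is the assumed module name.
import torch
import pc_util

B, C, N, npoints, nsample = 2, 64, 1024, 128, 16
points = torch.rand(B, C, N).cuda().contiguous()
# idx would normally come from ball_query_wrapper / knn_query_wrapper.
idx = torch.randint(0, N, (B, npoints, nsample), dtype=torch.int32, device=points.device)

out = torch.zeros(B, C, npoints, nsample, device=points.device)
pc_util.group_points_wrapper(B, C, N, npoints, nsample, points, idx, out)
# out[b, :, p, s] == points[b, :, idx[b, p, s]]

Each kernel launches one thread per output element; the backward kernel scatters with atomicAdd because several grouped samples can point at the same input point.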
pc_util/src/group_points_gpu.h ADDED
@@ -0,0 +1,36 @@
+ #ifndef _GROUP_POINTS_GPU_H
+ #define _GROUP_POINTS_GPU_H
+
+ #include <torch/serialize/tensor.h>
+ #include <cuda.h>
+ #include <cuda_runtime_api.h>
+ #include <vector>
+
+
+ int group_points_wrapper_fast(int b, int c, int n, int npoints, int nsample,
+     at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
+
+ void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
+     const float *points, const int *idx, float *out);
+
+ int group_points_grad_wrapper_fast(int b, int c, int n, int npoints, int nsample,
+     at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
+
+ void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
+     const float *grad_out, const int *idx, float *grad_points);
+
+ int group_points_wrapper_stack(int B, int M, int C, int nsample,
+     at::Tensor features_tensor, at::Tensor features_batch_cnt_tensor,
+     at::Tensor idx_tensor, at::Tensor idx_batch_cnt_tensor, at::Tensor out_tensor);
+
+ void group_points_kernel_launcher_stack(int B, int M, int C, int nsample,
+     const float *features, const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt, float *out);
+
+ int group_points_grad_wrapper_stack(int B, int M, int C, int N, int nsample,
+     at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor idx_batch_cnt_tensor,
+     at::Tensor features_batch_cnt_tensor, at::Tensor grad_features_tensor);
+
+ void group_points_grad_kernel_launcher_stack(int B, int M, int C, int N, int nsample,
+     const float *grad_out, const int *idx, const int *idx_batch_cnt, const int *features_batch_cnt, float *grad_features);
+
+ #endif
pc_util/src/interpolate.cpp ADDED
@@ -0,0 +1,148 @@
+ #include <torch/serialize/tensor.h>
+ #include <vector>
+ // #include <THC/THC.h>
+ #include <math.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <cuda.h>
+ #include <cuda_runtime_api.h>
+ #include "interpolate_gpu.h"
+
+ // extern THCState *state;
+
+ #include <ATen/cuda/CUDAContext.h>
+ #include <ATen/cuda/CUDAEvent.h>
+ // cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ #define CHECK_CUDA(x) do { \
+     if (!x.type().is_cuda()) { \
+         fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
+         exit(-1); \
+     } \
+ } while (0)
+ #define CHECK_CONTIGUOUS(x) do { \
+     if (!x.is_contiguous()) { \
+         fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
+         exit(-1); \
+     } \
+ } while (0)
+ #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
+
+
+ void three_nn_wrapper_fast(int b, int n, int m, at::Tensor unknown_tensor,
+     at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) {
+     const float *unknown = unknown_tensor.data<float>();
+     const float *known = known_tensor.data<float>();
+     float *dist2 = dist2_tensor.data<float>();
+     int *idx = idx_tensor.data<int>();
+
+     three_nn_kernel_launcher_fast(b, n, m, unknown, known, dist2, idx);
+ }
+
+
+ void three_interpolate_wrapper_fast(int b, int c, int m, int n,
+     at::Tensor points_tensor,
+     at::Tensor idx_tensor,
+     at::Tensor weight_tensor,
+     at::Tensor out_tensor) {
+
+     const float *points = points_tensor.data<float>();
+     const float *weight = weight_tensor.data<float>();
+     float *out = out_tensor.data<float>();
+     const int *idx = idx_tensor.data<int>();
+
+     three_interpolate_kernel_launcher_fast(b, c, m, n, points, idx, weight, out);
+ }
+
+ void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m,
+     at::Tensor grad_out_tensor,
+     at::Tensor idx_tensor,
+     at::Tensor weight_tensor,
+     at::Tensor grad_points_tensor) {
+
+     const float *grad_out = grad_out_tensor.data<float>();
+     const float *weight = weight_tensor.data<float>();
+     float *grad_points = grad_points_tensor.data<float>();
+     const int *idx = idx_tensor.data<int>();
+
+     three_interpolate_grad_kernel_launcher_fast(b, c, n, m, grad_out, idx, weight, grad_points);
+ }
+
+
+ void three_nn_wrapper_stack(at::Tensor unknown_tensor,
+     at::Tensor unknown_batch_cnt_tensor, at::Tensor known_tensor,
+     at::Tensor known_batch_cnt_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) {
+     // unknown: (N1 + N2 ..., 3)
+     // unknown_batch_cnt: (batch_size), [N1, N2, ...]
+     // known: (M1 + M2 ..., 3)
+     // known_batch_cnt: (batch_size), [M1, M2, ...]
+     // Return:
+     //      dist: (N1 + N2 ..., 3) squared l2 distance to the three nearest neighbors
+     //      idx: (N1 + N2 ..., 3) index of the three nearest neighbors
+     CHECK_INPUT(unknown_tensor);
+     CHECK_INPUT(unknown_batch_cnt_tensor);
+     CHECK_INPUT(known_tensor);
+     CHECK_INPUT(known_batch_cnt_tensor);
+     CHECK_INPUT(dist2_tensor);
+     CHECK_INPUT(idx_tensor);
+
+     int batch_size = unknown_batch_cnt_tensor.size(0);
+     int N = unknown_tensor.size(0);
+     int M = known_tensor.size(0);
+     const float *unknown = unknown_tensor.data<float>();
+     const int *unknown_batch_cnt = unknown_batch_cnt_tensor.data<int>();
+     const float *known = known_tensor.data<float>();
+     const int *known_batch_cnt = known_batch_cnt_tensor.data<int>();
+     float *dist2 = dist2_tensor.data<float>();
+     int *idx = idx_tensor.data<int>();
+
+     three_nn_kernel_launcher_stack(batch_size, N, M, unknown, unknown_batch_cnt, known, known_batch_cnt, dist2, idx);
+ }
+
+
+ void three_interpolate_wrapper_stack(at::Tensor features_tensor,
+     at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor) {
+     // features_tensor: (M1 + M2 ..., C)
+     // idx_tensor: [N1 + N2 ..., 3]
+     // weight_tensor: [N1 + N2 ..., 3]
+     // Return:
+     //      out_tensor: (N1 + N2 ..., C)
+     CHECK_INPUT(features_tensor);
+     CHECK_INPUT(idx_tensor);
+     CHECK_INPUT(weight_tensor);
+     CHECK_INPUT(out_tensor);
+
+     int N = out_tensor.size(0);
+     int channels = features_tensor.size(1);
+     const float *features = features_tensor.data<float>();
+     const float *weight = weight_tensor.data<float>();
+     const int *idx = idx_tensor.data<int>();
+     float *out = out_tensor.data<float>();
+
+     three_interpolate_kernel_launcher_stack(N, channels, features, idx, weight, out);
+ }
+
+
+ void three_interpolate_grad_wrapper_stack(at::Tensor grad_out_tensor, at::Tensor idx_tensor,
+     at::Tensor weight_tensor, at::Tensor grad_features_tensor) {
+     // grad_out_tensor: (N1 + N2 ..., C)
+     // idx_tensor: [N1 + N2 ..., 3]
+     // weight_tensor: [N1 + N2 ..., 3]
+     // Return:
+     //      grad_features_tensor: (M1 + M2 ..., C)
+     CHECK_INPUT(grad_out_tensor);
+     CHECK_INPUT(idx_tensor);
+     CHECK_INPUT(weight_tensor);
+     CHECK_INPUT(grad_features_tensor);
+
+     int N = grad_out_tensor.size(0);
+     int channels = grad_out_tensor.size(1);
+     const float *grad_out = grad_out_tensor.data<float>();
+     const float *weight = weight_tensor.data<float>();
+     const int *idx = idx_tensor.data<int>();
+     float *grad_features = grad_features_tensor.data<float>();
+
+     // printf("N=%d, channels=%d\n", N, channels);
+     three_interpolate_grad_kernel_launcher_stack(N, channels, grad_out, idx, weight, grad_features);
+ }
pc_util/src/interpolate_gpu.cu ADDED
@@ -0,0 +1,343 @@
+ #include <math.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+
+ #include "cuda_utils.h"
+ #include "interpolate_gpu.h"
+
+
+ __global__ void three_nn_kernel_fast(int b, int n, int m, const float *__restrict__ unknown,
+     const float *__restrict__ known, float *__restrict__ dist2, int *__restrict__ idx) {
+     // unknown: (B, N, 3)
+     // known: (B, M, 3)
+     // output:
+     //      dist2: (B, N, 3) squared distances to the three nearest neighbors
+     //      idx: (B, N, 3)
+
+     int bs_idx = blockIdx.y;
+     int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+     if (bs_idx >= b || pt_idx >= n) return;
+
+     unknown += bs_idx * n * 3 + pt_idx * 3;
+     known += bs_idx * m * 3;
+     dist2 += bs_idx * n * 3 + pt_idx * 3;
+     idx += bs_idx * n * 3 + pt_idx * 3;
+
+     float ux = unknown[0];
+     float uy = unknown[1];
+     float uz = unknown[2];
+
+     // Running top-3 (best1 <= best2 <= best3), maintained by insertion as
+     // each candidate squared distance is computed.
+     double best1 = 1e40, best2 = 1e40, best3 = 1e40;
+     int besti1 = 0, besti2 = 0, besti3 = 0;
+     for (int k = 0; k < m; ++k) {
+         float x = known[k * 3 + 0];
+         float y = known[k * 3 + 1];
+         float z = known[k * 3 + 2];
+         float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
+         if (d < best1) {
+             best3 = best2; besti3 = besti2;
+             best2 = best1; besti2 = besti1;
+             best1 = d; besti1 = k;
+         }
+         else if (d < best2) {
+             best3 = best2; besti3 = besti2;
+             best2 = d; besti2 = k;
+         }
+         else if (d < best3) {
+             best3 = d; besti3 = k;
+         }
+     }
+     dist2[0] = best1; dist2[1] = best2; dist2[2] = best3;
+     idx[0] = besti1; idx[1] = besti2; idx[2] = besti3;
+ }
+
+
+ void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown,
+     const float *known, float *dist2, int *idx) {
+     // unknown: (B, N, 3)
+     // known: (B, M, 3)
+     // output:
+     //      dist2: (B, N, 3)
+     //      idx: (B, N, 3)
+
+     cudaError_t err;
+     dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b);  // one thread per unknown point
+     dim3 threads(THREADS_PER_BLOCK);
+
+     three_nn_kernel_fast<<<blocks, threads>>>(b, n, m, unknown, known, dist2, idx);
+
+     err = cudaGetLastError();
+     if (cudaSuccess != err) {
+         fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+         exit(-1);
+     }
+ }
+
+
+ __global__ void three_interpolate_kernel_fast(int b, int c, int m, int n, const float *__restrict__ points,
+     const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ out) {
+     // points: (B, C, M)
+     // idx: (B, N, 3)
+     // weight: (B, N, 3)
+     // output:
+     //      out: (B, C, N)
+
+     int bs_idx = blockIdx.z;
+     int c_idx = blockIdx.y;
+     int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+     if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;
+
+     weight += bs_idx * n * 3 + pt_idx * 3;
+     points += bs_idx * c * m + c_idx * m;
+     idx += bs_idx * n * 3 + pt_idx * 3;
+     out += bs_idx * c * n + c_idx * n;
+
+     // Weighted sum of the three neighbor features.
+     out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] + weight[2] * points[idx[2]];
+ }
+
+ void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n,
+     const float *points, const int *idx, const float *weight, float *out) {
+     // points: (B, C, M)
+     // idx: (B, N, 3)
+     // weight: (B, N, 3)
+     // output:
+     //      out: (B, C, N)
+
+     cudaError_t err;
+     dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);  // one thread per output element
+     dim3 threads(THREADS_PER_BLOCK);
+     three_interpolate_kernel_fast<<<blocks, threads>>>(b, c, m, n, points, idx, weight, out);
+
+     err = cudaGetLastError();
+     if (cudaSuccess != err) {
+         fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+         exit(-1);
+     }
+ }
+
+
+ __global__ void three_interpolate_grad_kernel_fast(int b, int c, int n, int m, const float *__restrict__ grad_out,
+     const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ grad_points) {
+     // grad_out: (B, C, N)
+     // weight: (B, N, 3)
+     // output:
+     //      grad_points: (B, C, M)
+
+     int bs_idx = blockIdx.z;
+     int c_idx = blockIdx.y;
+     int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+     if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;
+
+     grad_out += bs_idx * c * n + c_idx * n + pt_idx;
+     weight += bs_idx * n * 3 + pt_idx * 3;
+     grad_points += bs_idx * c * m + c_idx * m;
+     idx += bs_idx * n * 3 + pt_idx * 3;
+
+     // Scatter the output gradient back to the three source points.
+     atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
+     atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
+     atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
+ }
+
+ void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, const float *grad_out,
+     const int *idx, const float *weight, float *grad_points) {
+     // grad_out: (B, C, N)
+     // weight: (B, N, 3)
+     // output:
+     //      grad_points: (B, C, M)
+
+     cudaError_t err;
+     dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);  // one thread per grad_out element
+     dim3 threads(THREADS_PER_BLOCK);
+     three_interpolate_grad_kernel_fast<<<blocks, threads>>>(b, c, n, m, grad_out, idx, weight, grad_points);
+
+     err = cudaGetLastError();
+     if (cudaSuccess != err) {
+         fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+         exit(-1);
+     }
+ }
+
+
+ __global__ void three_nn_kernel_stack(int batch_size, int N, int M, const float *unknown,
+     const int *unknown_batch_cnt, const float *known, const int *known_batch_cnt,
+     float *dist2, int *idx) {
+     // unknown: (N1 + N2 ..., 3)
+     // unknown_batch_cnt: (batch_size), [N1, N2, ...]
+     // known: (M1 + M2 ..., 3)
+     // known_batch_cnt: (batch_size), [M1, M2, ...]
+     // Return:
+     //      dist: (N1 + N2 ..., 3) squared l2 distance to the three nearest neighbors
+     //      idx: (N1 + N2 ..., 3) index of the three nearest neighbors
+
+     int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+     if (pt_idx >= N) return;
+
+     // Locate this point's batch element by walking the per-batch counts.
+     int bs_idx = 0, pt_cnt = unknown_batch_cnt[0];
+     for (int k = 1; k < batch_size; k++) {
+         if (pt_idx < pt_cnt) break;
+         pt_cnt += unknown_batch_cnt[k];
+         bs_idx = k;
+     }
+
+     int cur_num_known_points = known_batch_cnt[bs_idx];
+
+     int known_batch_start_idx = 0;
+     for (int k = 0; k < bs_idx; k++) known_batch_start_idx += known_batch_cnt[k];
+
+     known += known_batch_start_idx * 3;
+     unknown += pt_idx * 3;
+     dist2 += pt_idx * 3;
+     idx += pt_idx * 3;
+
+     float ux = unknown[0];
+     float uy = unknown[1];
+     float uz = unknown[2];
+
+     double best1 = 1e40, best2 = 1e40, best3 = 1e40;
+     int besti1 = 0, besti2 = 0, besti3 = 0;
+     for (int k = 0; k < cur_num_known_points; ++k) {
+         float x = known[k * 3 + 0];
+         float y = known[k * 3 + 1];
+         float z = known[k * 3 + 2];
+         float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
+         if (d < best1) {
+             best3 = best2; besti3 = besti2;
+             best2 = best1; besti2 = besti1;
+             best1 = d; besti1 = k;
+         }
+         else if (d < best2) {
+             best3 = best2; besti3 = besti2;
+             best2 = d; besti2 = k;
+         }
+         else if (d < best3) {
+             best3 = d; besti3 = k;
+         }
+     }
+     dist2[0] = best1; dist2[1] = best2; dist2[2] = best3;
+     // Indices are shifted into the flattened (stacked) coordinate frame.
+     idx[0] = besti1 + known_batch_start_idx;
+     idx[1] = besti2 + known_batch_start_idx;
+     idx[2] = besti3 + known_batch_start_idx;
+ }
+
+
+ void three_nn_kernel_launcher_stack(int batch_size, int N, int M, const float *unknown,
+     const int *unknown_batch_cnt, const float *known, const int *known_batch_cnt,
+     float *dist2, int *idx) {
+     // unknown: (N1 + N2 ..., 3)
+     // unknown_batch_cnt: (batch_size), [N1, N2, ...]
+     // known: (M1 + M2 ..., 3)
+     // known_batch_cnt: (batch_size), [M1, M2, ...]
+     // Return:
+     //      dist: (N1 + N2 ..., 3) squared l2 distance to the three nearest neighbors
+     //      idx: (N1 + N2 ..., 3) index of the three nearest neighbors
+
+     cudaError_t err;
+     dim3 blocks(DIVUP(N, THREADS_PER_BLOCK));  // one thread per unknown point
+     dim3 threads(THREADS_PER_BLOCK);
+
+     three_nn_kernel_stack<<<blocks, threads>>>(
+         batch_size, N, M, unknown, unknown_batch_cnt,
+         known, known_batch_cnt, dist2, idx
+     );
+
+     err = cudaGetLastError();
+     if (cudaSuccess != err) {
+         fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+         exit(-1);
+     }
+ }
+
+
+ __global__ void three_interpolate_kernel_stack(int N, int channels, const float *features,
+     const int *idx, const float *weight, float *out) {
+     // features: (M1 + M2 ..., C)
+     // idx: [N1 + N2 ..., 3]
+     // weight: [N1 + N2 ..., 3]
+     // Return:
+     //      out: (N1 + N2 ..., C)
+
+     int c_idx = blockIdx.y;
+     int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+     if (pt_idx >= N || c_idx >= channels) return;
+
+     weight += pt_idx * 3;
+     idx += pt_idx * 3;
+     out += pt_idx * channels + c_idx;
+
+     out[0] = weight[0] * features[idx[0] * channels + c_idx] +
+              weight[1] * features[idx[1] * channels + c_idx] +
+              weight[2] * features[idx[2] * channels + c_idx];
+ }
+
+
+ void three_interpolate_kernel_launcher_stack(int N, int channels,
+     const float *features, const int *idx, const float *weight, float *out) {
+     // features: (M1 + M2 ..., C)
+     // idx: [N1 + N2 ..., 3]
+     // weight: [N1 + N2 ..., 3]
+     // Return:
+     //      out: (N1 + N2 ..., C)
+
+     cudaError_t err;
+     dim3 blocks(DIVUP(N, THREADS_PER_BLOCK), channels);
+     dim3 threads(THREADS_PER_BLOCK);
+     three_interpolate_kernel_stack<<<blocks, threads>>>(N, channels, features, idx, weight, out);
+
+     err = cudaGetLastError();
+     if (cudaSuccess != err) {
+         fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+         exit(-1);
+     }
+ }
+
+
+ __global__ void three_interpolate_grad_kernel_stack(int N, int channels, const float *grad_out,
+     const int *idx, const float *weight, float *grad_features) {
+     // grad_out: (N1 + N2 ..., C)
+     // idx: [N1 + N2 ..., 3]
+     // weight: [N1 + N2 ..., 3]
+     // Return:
+     //      grad_features: (M1 + M2 ..., C)
+
+     int c_idx = blockIdx.y;
+     int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+     if (pt_idx >= N || c_idx >= channels) return;
+
+     grad_out += pt_idx * channels + c_idx;
+     weight += pt_idx * 3;
+     idx += pt_idx * 3;
+
+     // printf("pt_idx=%d, c_idx=%d, idx=(%d, %d, %d), grad_out=%f\n", pt_idx, c_idx, idx[0], idx[1], idx[2], grad_out[0]);
+
+     atomicAdd(grad_features + idx[0] * channels + c_idx, grad_out[0] * weight[0]);
+     atomicAdd(grad_features + idx[1] * channels + c_idx, grad_out[0] * weight[1]);
+     atomicAdd(grad_features + idx[2] * channels + c_idx, grad_out[0] * weight[2]);
+ }
+
+
+ void three_interpolate_grad_kernel_launcher_stack(int N, int channels, const float *grad_out,
+     const int *idx, const float *weight, float *grad_features) {
+     // grad_out: (N1 + N2 ..., C)
+     // idx: [N1 + N2 ..., 3]
+     // weight: [N1 + N2 ..., 3]
+     // Return:
+     //      grad_features: (M1 + M2 ..., C)
+
+     cudaError_t err;
+     dim3 blocks(DIVUP(N, THREADS_PER_BLOCK), channels);
+     dim3 threads(THREADS_PER_BLOCK);
+     three_interpolate_grad_kernel_stack<<<blocks, threads>>>(
+         N, channels, grad_out, idx, weight, grad_features
+     );
+
+     err = cudaGetLastError();
+     if (cudaSuccess != err) {
+         fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+         exit(-1);
+     }
+ }
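three_nn returns squared distances, and the usual PointNet++ recipe converts them to normalized inverse-distance weights on the Python side before calling three_interpolate. A sketch of the batched pair; the pc_util module name is assumed, and the weight computation is the standard recipe rather than code from this diff:

# Assumed usage of the three_nn / three_interpolate pair (`pc_util` name assumed).
import torch
import pc_util

B, N, M, C = 2, 1024, 256, 64
unknown = torch.rand(B, N, 3).cuda().contiguous()    # points to interpolate at
known = torch.rand(B, M, 3).cuda().contiguous()      # points carrying features
features = torch.rand(B, C, M).cuda().contiguous()

dist2 = torch.zeros(B, N, 3, device=unknown.device)
idx = torch.zeros(B, N, 3, dtype=torch.int32, device=unknown.device)
pc_util.three_nn_wrapper(B, N, M, unknown, known, dist2, idx)

# Standard PointNet++ inverse-distance weighting (done outside the kernels).
recip = 1.0 / (dist2 + 1e-8)
weight = recip / recip.sum(dim=2, keepdim=True)      # (B, N, 3), rows sum to 1

out = torch.zeros(B, C, N, device=unknown.device)
pc_util.three_interpolate_wrapper(B, C, M, N, features, idx, weight, out)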
pc_util/src/interpolate_gpu.h ADDED
@@ -0,0 +1,61 @@
+ #ifndef _INTERPOLATE_GPU_H
+ #define _INTERPOLATE_GPU_H
+
+ #include <torch/serialize/tensor.h>
+ #include <vector>
+ #include <cuda.h>
+ #include <cuda_runtime_api.h>
+
+
+ void three_nn_wrapper_fast(int b, int n, int m, at::Tensor unknown_tensor,
+     at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
+
+ void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown,
+     const float *known, float *dist2, int *idx);
+
+
+ void three_interpolate_wrapper_fast(int b, int c, int m, int n, at::Tensor points_tensor,
+     at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);
+
+ void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n,
+     const float *points, const int *idx, const float *weight, float *out);
+
+
+ void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m, at::Tensor grad_out_tensor,
+     at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor);
+
+ void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, const float *grad_out,
+     const int *idx, const float *weight, float *grad_points);
+
+
+ void three_nn_wrapper_stack(at::Tensor unknown_tensor,
+     at::Tensor unknown_batch_cnt_tensor, at::Tensor known_tensor,
+     at::Tensor known_batch_cnt_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
+
+ void three_interpolate_wrapper_stack(at::Tensor features_tensor,
+     at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);
+
+ void three_interpolate_grad_wrapper_stack(at::Tensor grad_out_tensor, at::Tensor idx_tensor,
+     at::Tensor weight_tensor, at::Tensor grad_features_tensor);
+
+ void three_nn_kernel_launcher_stack(int batch_size, int N, int M, const float *unknown,
+     const int *unknown_batch_cnt, const float *known, const int *known_batch_cnt,
+     float *dist2, int *idx);
+
+ void three_interpolate_kernel_launcher_stack(int N, int channels,
+     const float *features, const int *idx, const float *weight, float *out);
+
+ void three_interpolate_grad_kernel_launcher_stack(int N, int channels, const float *grad_out,
+     const int *idx, const float *weight, float *grad_features);
+
+ #endif
pc_util/src/pointnet2_api.cpp ADDED
@@ -0,0 +1,41 @@
+ #include <torch/serialize/tensor.h>
+ #include <torch/extension.h>
+
+ #include "ball_query_gpu.h"
+ #include "group_points_gpu.h"
+ #include "sampling_gpu.h"
+ #include "interpolate_gpu.h"
+ #include "cluster_gpu.h"
+
+
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+     m.def("ball_query_wrapper", &ball_query_wrapper_fast, "ball_query_wrapper_fast");
+     m.def("ball_center_query_wrapper", &ball_center_query_wrapper_fast, "ball_center_query_wrapper_fast");
+     m.def("knn_query_wrapper", &knn_query_wrapper_fast, "knn_query_wrapper_fast");
+
+     m.def("group_points_wrapper", &group_points_wrapper_fast, "group_points_wrapper_fast");
+     m.def("group_points_grad_wrapper", &group_points_grad_wrapper_fast, "group_points_grad_wrapper_fast");
+
+     m.def("gather_points_wrapper", &gather_points_wrapper_fast, "gather_points_wrapper_fast");
+     m.def("gather_points_grad_wrapper", &gather_points_grad_wrapper_fast, "gather_points_grad_wrapper_fast");
+
+     m.def("furthest_point_sampling_wrapper", &furthest_point_sampling_wrapper, "furthest_point_sampling_wrapper");
+
+     m.def("three_nn_wrapper", &three_nn_wrapper_fast, "three_nn_wrapper_fast");
+     m.def("three_interpolate_wrapper", &three_interpolate_wrapper_fast, "three_interpolate_wrapper_fast");
+     m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper_fast, "three_interpolate_grad_wrapper_fast");
+
+     m.def("dbscan_wrapper", &dbscan_wrapper_fast, "dbscan_wrapper_fast");
+     m.def("cluster_pts_wrapper", &cluster_pts_wrapper_fast, "cluster_pts_wrapper_fast");
+
+
+     m.def("ball_query_wrapper_stack", &ball_query_wrapper_stack, "ball_query_wrapper_stack");
+
+     m.def("group_points_wrapper_stack", &group_points_wrapper_stack, "group_points_wrapper_stack");
+     m.def("group_points_grad_wrapper_stack", &group_points_grad_wrapper_stack, "group_points_grad_wrapper_stack");
+
+     m.def("three_nn_wrapper_stack", &three_nn_wrapper_stack, "three_nn_wrapper_stack");
+     m.def("three_interpolate_wrapper_stack", &three_interpolate_wrapper_stack, "three_interpolate_wrapper_stack");
+     m.def("three_interpolate_grad_wrapper_stack", &three_interpolate_grad_wrapper_stack, "three_interpolate_grad_wrapper_stack");
+
+ }
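pointnet2_api.cpp is the pybind11 entry point: building the extension with pc_util/setup.py yields a single module whose attributes are exactly the names passed to m.def above. A quick way to confirm the bindings after a build (the pc_util import name is an assumption, since setup.py is not shown in this view):

# After building the extension (e.g. `python setup.py install` in pc_util/),
# the bound wrappers appear as module attributes; `pc_util` is an assumed name.
import pc_util

print(sorted(name for name in dir(pc_util) if not name.startswith('_')))
# Expected to include the names registered above: ball_query_wrapper,
# knn_query_wrapper, group_points_wrapper, gather_points_wrapper,
# furthest_point_sampling_wrapper, three_nn_wrapper, dbscan_wrapper,
# cluster_pts_wrapper, and the *_stack variants.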
pc_util/src/sampling.cpp ADDED
@@ -0,0 +1,46 @@
+ #include <torch/serialize/tensor.h>
+ #include <ATen/cuda/CUDAContext.h>
+ #include <vector>
+ // #include <THC/THC.h>
+
+ #include "sampling_gpu.h"
+
+ // extern THCState *state;
+
+ #include <ATen/cuda/CUDAEvent.h>
+ // cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ int gather_points_wrapper_fast(int b, int c, int n, int npoints,
+     at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor) {
+     const float *points = points_tensor.data<float>();
+     const int *idx = idx_tensor.data<int>();
+     float *out = out_tensor.data<float>();
+
+     gather_points_kernel_launcher_fast(b, c, n, npoints, points, idx, out);
+     return 1;
+ }
+
+
+ int gather_points_grad_wrapper_fast(int b, int c, int n, int npoints,
+     at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) {
+
+     const float *grad_out = grad_out_tensor.data<float>();
+     const int *idx = idx_tensor.data<int>();
+     float *grad_points = grad_points_tensor.data<float>();
+
+     gather_points_grad_kernel_launcher_fast(b, c, n, npoints, grad_out, idx, grad_points);
+     return 1;
+ }
+
+
+ int furthest_point_sampling_wrapper(int b, int c, int n, int m, float w1, float w2,
+     at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor) {
+
+     const float *points = points_tensor.data<float>();
+     float *temp = temp_tensor.data<float>();
+     int *idx = idx_tensor.data<int>();
+
+     furthest_point_sampling_kernel_launcher(b, c, n, m, w1, w2, points, temp, idx);
+     return 1;
+ }
pc_util/src/sampling_gpu.cu ADDED
@@ -0,0 +1,259 @@
+ #include <stdio.h>
+ #include <stdlib.h>
+
+ #include "cuda_utils.h"
+ #include "sampling_gpu.h"
+
+
+ __global__ void gather_points_kernel_fast(int b, int c, int n, int m,
+     const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
+     // points: (B, C, N)
+     // idx: (B, M)
+     // output:
+     //      out: (B, C, M)
+
+     int bs_idx = blockIdx.z;
+     int c_idx = blockIdx.y;
+     int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+     if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
+
+     out += bs_idx * c * m + c_idx * m + pt_idx;
+     idx += bs_idx * m + pt_idx;
+     points += bs_idx * c * n + c_idx * n;
+     out[0] = points[idx[0]];
+ }
+
+ void gather_points_kernel_launcher_fast(int b, int c, int n, int npoints,
+     const float *points, const int *idx, float *out) {
+     // points: (B, C, N)
+     // idx: (B, npoints)
+     // output:
+     //      out: (B, C, npoints)
+
+     cudaError_t err;
+     dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b);  // one thread per output element
+     dim3 threads(THREADS_PER_BLOCK);
+
+     gather_points_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, points, idx, out);
+
+     err = cudaGetLastError();
+     if (cudaSuccess != err) {
+         fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+         exit(-1);
+     }
+ }
+
+ __global__ void gather_points_grad_kernel_fast(int b, int c, int n, int m, const float *__restrict__ grad_out,
+     const int *__restrict__ idx, float *__restrict__ grad_points) {
+     // grad_out: (B, C, M)
+     // idx: (B, M)
+     // output:
+     //      grad_points: (B, C, N)
+
+     int bs_idx = blockIdx.z;
+     int c_idx = blockIdx.y;
+     int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+     if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
+
+     grad_out += bs_idx * c * m + c_idx * m + pt_idx;
+     idx += bs_idx * m + pt_idx;
+     grad_points += bs_idx * c * n + c_idx * n;
+
+     atomicAdd(grad_points + idx[0], grad_out[0]);
+ }
+
+ void gather_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints,
+     const float *grad_out, const int *idx, float *grad_points) {
+     // grad_out: (B, C, npoints)
+     // idx: (B, npoints)
+     // output:
+     //      grad_points: (B, C, N)
+
+     cudaError_t err;
+     dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b);  // one thread per grad_out element
+     dim3 threads(THREADS_PER_BLOCK);
+
+     gather_points_grad_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, grad_out, idx, grad_points);
+
+     err = cudaGetLastError();
+     if (cudaSuccess != err) {
+         fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+         exit(-1);
+     }
+ }
+
+
+ // Pairwise merge step of the shared-memory reduction: keep the larger of the
+ // two candidate distances (and its point index) in slot idx1.
+ __device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i, int idx1, int idx2) {
+     const float v1 = dists[idx1], v2 = dists[idx2];
+     const int i1 = dists_i[idx1], i2 = dists_i[idx2];
+     dists[idx1] = max(v1, v2);
+     dists_i[idx1] = v2 > v1 ? i2 : i1;
+ }
+
+ template <unsigned int block_size>
+ __global__ void furthest_point_sampling_kernel(int b, int c, int n, int m, float w1, float w2,
+     const float *__restrict__ dataset, float *__restrict__ temp, int *__restrict__ idxs) {
+     // dataset: (B, N, C), first 3 channels are xyz, the rest are extra features
+     // temp: (B, N), running minimum distances (must be initialized large)
+     // output:
+     //      idxs: (B, M)
+
+     if (m <= 0) return;
+     __shared__ float dists[block_size];
+     __shared__ int dists_i[block_size];
+
+     int batch_index = blockIdx.x;
+     dataset += batch_index * n * c;
+     temp += batch_index * n;
+     idxs += batch_index * m;
+
+     int tid = threadIdx.x;
+     const int stride = block_size;
+
+     int old = 0;
+     if (threadIdx.x == 0)
+         idxs[0] = old;
+
+     __syncthreads();
+     for (int j = 1; j < m; j++) {
+         int besti = 0;
+         float best = -1;
+         float x1 = dataset[old * c + 0];
+         float y1 = dataset[old * c + 1];
+         float z1 = dataset[old * c + 2];
+
+         // Each thread updates the running minimum distance of its strided
+         // slice of points, tracking the farthest one it has seen.
+         for (int k = tid; k < n; k += stride) {
+             float x2, y2, z2;
+             x2 = dataset[k * c + 0];
+             y2 = dataset[k * c + 1];
+             z2 = dataset[k * c + 2];
+             // float mag = (x2 * x2) + (y2 * y2) + (z2 * z2);
+             // if (mag <= 1e-3)
+             //     continue;
+
+             float xyz_d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
+             float fea_d = 0;
+             for (int l = 3; l < c; l++) {
+                 fea_d += (dataset[old * c + l] - dataset[k * c + l]) * (dataset[old * c + l] - dataset[k * c + l]);
+             }
+             float d = w1 * xyz_d + w2 * fea_d;  // weighted mix of spatial and feature distance
+             float d2 = min(d, temp[k]);
+             temp[k] = d2;
+             besti = d2 > best ? k : besti;
+             best = d2 > best ? d2 : best;
+         }
+         dists[tid] = best;
+         dists_i[tid] = besti;
+         __syncthreads();
+
+         // Tree reduction in shared memory to find the globally farthest point.
+         if (block_size >= 1024) {
+             if (tid < 512) {
+                 __update(dists, dists_i, tid, tid + 512);
+             }
+             __syncthreads();
+         }
+
+         if (block_size >= 512) {
+             if (tid < 256) {
+                 __update(dists, dists_i, tid, tid + 256);
+             }
+             __syncthreads();
+         }
+         if (block_size >= 256) {
+             if (tid < 128) {
+                 __update(dists, dists_i, tid, tid + 128);
+             }
+             __syncthreads();
+         }
+         if (block_size >= 128) {
+             if (tid < 64) {
+                 __update(dists, dists_i, tid, tid + 64);
+             }
+             __syncthreads();
+         }
+         if (block_size >= 64) {
+             if (tid < 32) {
+                 __update(dists, dists_i, tid, tid + 32);
+             }
+             __syncthreads();
+         }
+         if (block_size >= 32) {
+             if (tid < 16) {
+                 __update(dists, dists_i, tid, tid + 16);
+             }
+             __syncthreads();
+         }
+         if (block_size >= 16) {
+             if (tid < 8) {
+                 __update(dists, dists_i, tid, tid + 8);
+             }
+             __syncthreads();
+         }
+         if (block_size >= 8) {
+             if (tid < 4) {
+                 __update(dists, dists_i, tid, tid + 4);
+             }
+             __syncthreads();
+         }
+         if (block_size >= 4) {
+             if (tid < 2) {
+                 __update(dists, dists_i, tid, tid + 2);
+             }
+             __syncthreads();
+         }
+         if (block_size >= 2) {
+             if (tid < 1) {
+                 __update(dists, dists_i, tid, tid + 1);
+             }
+             __syncthreads();
+         }
+
+         old = dists_i[0];
+         if (tid == 0)
+             idxs[j] = old;
+     }
+ }
+
+ void furthest_point_sampling_kernel_launcher(int b, int c, int n, int m, float w1, float w2,
+     const float *dataset, float *temp, int *idxs) {
+     // dataset: (B, N, C)
+     // temp: (B, N)
+     // output:
+     //      idxs: (B, M)
+
+     cudaError_t err;
+     unsigned int n_threads = opt_n_threads(n);  // one block per batch element
+
+     switch (n_threads) {
+         case 1024:
+             furthest_point_sampling_kernel<1024><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
+         case 512:
+             furthest_point_sampling_kernel<512><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
+         case 256:
+             furthest_point_sampling_kernel<256><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
+         case 128:
+             furthest_point_sampling_kernel<128><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
+         case 64:
+             furthest_point_sampling_kernel<64><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
+         case 32:
+             furthest_point_sampling_kernel<32><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
+         case 16:
+             furthest_point_sampling_kernel<16><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
+         case 8:
+             furthest_point_sampling_kernel<8><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
+         case 4:
+             furthest_point_sampling_kernel<4><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
+         case 2:
+             furthest_point_sampling_kernel<2><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
+         case 1:
+             furthest_point_sampling_kernel<1><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
+         default:
+             furthest_point_sampling_kernel<512><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs);
+     }
+
+     err = cudaGetLastError();
+     if (cudaSuccess != err) {
+         fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+         exit(-1);
+     }
+ }
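This FPS variant generalizes the classic one: each point carries c channels (3 coordinates plus c - 3 feature channels), and the selection distance is w1 * squared xyz distance + w2 * squared feature distance. The temp buffer holds the running minimum distance to the already-selected set, so it must start large. A usage sketch with the pc_util module name and weight values assumed:

# Feature-weighted farthest point sampling sketch (`pc_util` name assumed).
import torch
import pc_util

B, N, C, M = 2, 1024, 6, 256                  # 3 coords + 3 extra feature channels
dataset = torch.rand(B, N, C).cuda().contiguous()

temp = torch.full((B, N), 1e10, device=dataset.device)   # running min distances
idx = torch.zeros(B, M, dtype=torch.int32, device=dataset.device)
w1, w2 = 1.0, 0.5                                         # assumed weights
pc_util.furthest_point_sampling_wrapper(B, C, N, M, w1, w2, dataset, temp, idx)
# idx[b] lists M point indices that are mutually far apart in the weighted metric.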
pc_util/src/sampling_gpu.h ADDED
@@ -0,0 +1,29 @@
+ #ifndef _SAMPLING_GPU_H
+ #define _SAMPLING_GPU_H
+
+ #include <torch/serialize/tensor.h>
+ #include <ATen/cuda/CUDAContext.h>
+ #include <vector>
+
+
+ int gather_points_wrapper_fast(int b, int c, int n, int npoints,
+     at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
+
+ void gather_points_kernel_launcher_fast(int b, int c, int n, int npoints,
+     const float *points, const int *idx, float *out);
+
+
+ int gather_points_grad_wrapper_fast(int b, int c, int n, int npoints,
+     at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
+
+ void gather_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints,
+     const float *grad_out, const int *idx, float *grad_points);
+
+
+ int furthest_point_sampling_wrapper(int b, int c, int n, int m, float w1, float w2,
+     at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor);
+
+ void furthest_point_sampling_kernel_launcher(int b, int c, int n, int m, float w1, float w2,
+     const float *dataset, float *temp, int *idxs);
+
+ #endif