From 6eb75f2233b5c39943064423ad1ade2b989ae17f Mon Sep 17 00:00:00 2001
From: nekanat <nekanat.stock@gmail.com>
Date: Sat, 16 Sep 2023 19:05:48 +0900
Subject: [PATCH 01/23] fix cmake

---
 CMakeLists.txt                                | 3 +++
 src/cupoch/integration/scalable_tsdfvolume.cu | 6 ++----
 third_party/stdgpu                            | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f092b47f..caf91744 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -160,6 +160,9 @@ elseif (UNIX)
 endif ()
 
 find_package(CUDA REQUIRED)
+if(NOT CMAKE_CUDA_ARCHITECTURES)
+  set(CMAKE_CUDA_ARCHITECTURES 86)
+endif()
 include(${CMAKE_SOURCE_DIR}/cmake/CudaComputeTargetFlags.cmake)
 APPEND_TARGET_ARCH_FLAGS()
 if (NOT cuda_nvcc_target_flags)
diff --git a/src/cupoch/integration/scalable_tsdfvolume.cu b/src/cupoch/integration/scalable_tsdfvolume.cu
index d8301c56..4180bd8d 100644
--- a/src/cupoch/integration/scalable_tsdfvolume.cu
+++ b/src/cupoch/integration/scalable_tsdfvolume.cu
@@ -125,8 +125,6 @@ __global__ void OpenVolumeUnitKernel(const Eigen::Vector3f *points,
 struct extract_pointcloud_functor {
     extract_pointcloud_functor(
             const VolumeUnitsMap &volume_units,
-            const stdgpu::device_indexed_range<const VolumeUnitsMap::value_type>
-                    &range,
             int resolution,
             float voxel_length,
             float volume_unit_length,
@@ -135,7 +133,7 @@ struct extract_pointcloud_functor {
             Eigen::Vector3f *normals,
             Eigen::Vector3f *colors)
         : volume_units_(volume_units),
-          range_(range),
+          range_(volume_units.device_range()),
           resolution_(resolution),
           voxel_length_(voxel_length),
           half_voxel_length_(0.5 * voxel_length_),
@@ -363,7 +361,7 @@ std::shared_ptr<geometry::PointCloud> ScalableTSDFVolume::ExtractPointCloud() {
     pointcloud->normals_.resize(3 * n_total, nanvec);
     pointcloud->colors_.resize(3 * n_total, nanvec);
     extract_pointcloud_functor func(
-            impl_->volume_units_, impl_->volume_units_.device_range(),
+            impl_->volume_units_,
             resolution_, voxel_length_, volume_unit_length_, color_type_,
             thrust::raw_pointer_cast(pointcloud->points_.data()),
             thrust::raw_pointer_cast(pointcloud->normals_.data()),
diff --git a/third_party/stdgpu b/third_party/stdgpu
index bc3adaeb..b70405bc 160000
--- a/third_party/stdgpu
+++ b/third_party/stdgpu
@@ -1 +1 @@
-Subproject commit bc3adaeb6e50ae2225c9173db5aeaf6197ce5f80
+Subproject commit b70405bc62958f1f1334a936a20cd025351816fb

From f403039430ec3ada0eace5e126fe6bac9c646c42 Mon Sep 17 00:00:00 2001
From: nekanat <nekanat.stock@gmail.com>
Date: Sat, 16 Sep 2023 21:51:26 +0900
Subject: [PATCH 02/23] cuda12

---
 .github/workflows/ubuntu.yml           |   2 +-
 .github/workflows/windows.yml          |   4 +
 src/cupoch/geometry/laserscanbuffer.cu |   5 +
 src/cupoch/geometry/occupancygrid.cu   |  11 ++
 src/cupoch/geometry/occupancygrid.h    |   4 +
 src/cupoch/utility/device_vector.h     |   3 +-
 src/cupoch/utility/pinned_allocator.h  | 242 +++++++++++++++++++++++++
 src/tests/geometry/pointcloud.cpp      |  25 ++-
 8 files changed, 285 insertions(+), 11 deletions(-)
 create mode 100644 src/cupoch/utility/pinned_allocator.h

diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml
index bd76a723..b6cbeedb 100644
--- a/.github/workflows/ubuntu.yml
+++ b/.github/workflows/ubuntu.yml
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
 
     steps:
     - name: Checkout source code
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 326e3708..6afe6ad0 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -28,6 +28,10 @@ jobs:
             cuda: "11.2.0"
             visual_studio: "Visual Studio 16 2019"
             python-version: "3.10"
+          - os: windows-2019
+            cuda: "11.2.0"
+            visual_studio: "Visual Studio 16 2019"
+            python-version: "3.11"
     env:
       build_dir: "build"
       config: "Release"
diff --git a/src/cupoch/geometry/laserscanbuffer.cu b/src/cupoch/geometry/laserscanbuffer.cu
index 8facabbc..4706f5f9 100644
--- a/src/cupoch/geometry/laserscanbuffer.cu
+++ b/src/cupoch/geometry/laserscanbuffer.cu
@@ -298,6 +298,11 @@ template LaserScanBuffer& LaserScanBuffer::AddRanges(
         const Eigen::Matrix4f& transformation,
         const utility::pinned_host_vector<float>& intensities);
 
+template LaserScanBuffer& LaserScanBuffer::AddRanges(
+        const thrust::host_vector<float>& ranges,
+        const Eigen::Matrix4f& transformation,
+        const thrust::host_vector<float>& intensities);
+
 
 class ContainerLikePtr {
 public:
diff --git a/src/cupoch/geometry/occupancygrid.cu b/src/cupoch/geometry/occupancygrid.cu
index e6f7fa1d..de19ee95 100644
--- a/src/cupoch/geometry/occupancygrid.cu
+++ b/src/cupoch/geometry/occupancygrid.cu
@@ -526,6 +526,17 @@ OccupancyGrid& OccupancyGrid::Insert(
     return Insert(dev_points, viewpoint, max_range);
 }
 
+OccupancyGrid& OccupancyGrid::Insert(
+        const thrust::host_vector<Eigen::Vector3f>& points,
+        const Eigen::Vector3f& viewpoint,
+        float max_range) {
+    utility::device_vector<Eigen::Vector3f> dev_points(points.size());
+    cudaSafeCall(cudaMemcpy(
+            thrust::raw_pointer_cast(dev_points.data()), thrust::raw_pointer_cast(points.data()),
+            points.size() * sizeof(Eigen::Vector3f), cudaMemcpyHostToDevice));
+    return Insert(dev_points, viewpoint, max_range);
+}
+
 OccupancyGrid& OccupancyGrid::Insert(const geometry::PointCloud& pointcloud,
                                      const Eigen::Vector3f& viewpoint,
                                      float max_range) {
diff --git a/src/cupoch/geometry/occupancygrid.h b/src/cupoch/geometry/occupancygrid.h
index c06cc001..303e8776 100644
--- a/src/cupoch/geometry/occupancygrid.h
+++ b/src/cupoch/geometry/occupancygrid.h
@@ -108,6 +108,10 @@ class OccupancyGrid : public DenseGrid<OccupancyVoxel> {
             const utility::pinned_host_vector<Eigen::Vector3f>& points,
             const Eigen::Vector3f& viewpoint,
             float max_range = -1.0);
+    OccupancyGrid& Insert(
+            const thrust::host_vector<Eigen::Vector3f>& points,
+            const Eigen::Vector3f& viewpoint,
+            float max_range = -1.0);
     OccupancyGrid& Insert(const PointCloud& pointcloud,
                           const Eigen::Vector3f& viewpoint,
                           float max_range = -1.0);
diff --git a/src/cupoch/utility/device_vector.h b/src/cupoch/utility/device_vector.h
index cdcaf825..1d3e117d 100644
--- a/src/cupoch/utility/device_vector.h
+++ b/src/cupoch/utility/device_vector.h
@@ -34,7 +34,8 @@
 #include <thrust/device_vector.h>
 #endif
 #include <thrust/host_vector.h>
-#include <thrust/system/cuda/experimental/pinned_allocator.h>
+#include <cupoch/utility/pinned_allocator.h>
+
 
 #if defined(_WIN32)
 struct float4_t {
diff --git a/src/cupoch/utility/pinned_allocator.h b/src/cupoch/utility/pinned_allocator.h
new file mode 100644
index 00000000..77c1986e
--- /dev/null
+++ b/src/cupoch/utility/pinned_allocator.h
@@ -0,0 +1,242 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/system/cuda/experimental/pinned_allocator.h
+ *  \brief An allocator which creates new elements in "pinned" memory with \p cudaMallocHost
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
+#include <stdexcept>
+#include <limits>
+#include <string>
+#include <thrust/system/system_error.h>
+#include <thrust/system/cuda/error.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace system
+{
+
+namespace cuda
+{
+
+namespace experimental
+{
+
+/*! \addtogroup memory_management_classes
+ *  \ingroup memory_management
+ *  \{
+ */
+
+/*! \p pinned_allocator is a CUDA-specific host memory allocator
+ *  that employs \c cudaMallocHost for allocation.
+ *
+ *  \see https://en.cppreference.com/w/cpp/memory/allocator
+ */
+template<typename T> class pinned_allocator;
+
+template<>
+  class pinned_allocator<void>
+{
+  public:
+    typedef void           value_type;
+    typedef void       *   pointer;
+    typedef const void *   const_pointer;
+    typedef std::size_t    size_type;
+    typedef std::ptrdiff_t difference_type;
+
+    // convert a pinned_allocator<void> to pinned_allocator<U>
+    template<typename U>
+      struct rebind
+    {
+      typedef pinned_allocator<U> other;
+    }; // end rebind
+}; // end pinned_allocator
+
+
+template<typename T>
+  class pinned_allocator
+{
+  public:
+    //! \{
+    typedef T              value_type;
+    typedef T*             pointer;
+    typedef const T*       const_pointer;
+    typedef T&             reference;
+    typedef const T&       const_reference;
+    typedef std::size_t    size_type;
+    typedef std::ptrdiff_t difference_type;
+    //! \}
+
+    // convert a pinned_allocator<T> to pinned_allocator<U>
+    template<typename U>
+      struct rebind
+    {
+      typedef pinned_allocator<U> other;
+    }; // end rebind
+
+    /*! \p pinned_allocator's null constructor does nothing.
+     */
+    __host__ __device__
+    inline pinned_allocator() {}
+
+    /*! \p pinned_allocator's null destructor does nothing.
+     */
+    __host__ __device__
+    inline ~pinned_allocator() {}
+
+    /*! \p pinned_allocator's copy constructor does nothing.
+     */
+    __host__ __device__
+    inline pinned_allocator(pinned_allocator const &) {}
+
+    /*! This version of \p pinned_allocator's copy constructor
+     *  is templated on the \c value_type of the \p pinned_allocator
+     *  to copy from.  It is provided merely for convenience; it
+     *  does nothing.
+     */
+    template<typename U>
+    __host__ __device__
+    inline pinned_allocator(pinned_allocator<U> const &) {}
+
+    /*! This method returns the address of a \c reference of
+     *  interest.
+     *
+     *  \p r The \c reference of interest.
+     *  \return \c r's address.
+     */
+    __host__ __device__
+    inline pointer address(reference r) { return &r; }
+
+    /*! This method returns the address of a \c const_reference
+     *  of interest.
+     *
+     *  \p r The \c const_reference of interest.
+     *  \return \c r's address.
+     */
+    __host__ __device__
+    inline const_pointer address(const_reference r) { return &r; }
+
+    /*! This method allocates storage for objects in pinned host
+     *  memory.
+     *
+     *  \p cnt The number of objects to allocate.
+     *  \return a \c pointer to the newly allocated objects.
+     *  \note This method does not invoke \p value_type's constructor.
+     *        It is the responsibility of the caller to initialize the
+     *        objects at the returned \c pointer.
+     */
+    __host__
+    inline pointer allocate(size_type cnt,
+                            const_pointer = 0)
+    {
+      if(cnt > this->max_size())
+      {
+        throw std::bad_alloc();
+      } // end if
+
+      pointer result(0);
+      cudaError_t error = cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type));
+
+      if(error)
+      {
+        cudaGetLastError(); // Clear global CUDA error state.
+        throw std::bad_alloc();
+      } // end if
+
+      return result;
+    } // end allocate()
+
+    /*! This method deallocates pinned host memory previously allocated
+     *  with this \c pinned_allocator.
+     *
+     *  \p p A \c pointer to the previously allocated memory.
+     *  \p cnt The number of objects previously allocated at
+     *         \p p.
+     *  \note This method does not invoke \p value_type's destructor.
+     *        It is the responsibility of the caller to destroy
+     *        the objects stored at \p p.
+     */
+    __host__
+    inline void deallocate(pointer p, size_type /*cnt*/)
+    {
+      cudaError_t error = cudaFreeHost(p);
+
+      cudaGetLastError(); // Clear global CUDA error state.
+
+      if(error)
+      {
+        cudaGetLastError(); // Clear global CUDA error state.
+        throw thrust::system_error(error, thrust::cuda_category());
+      } // end if
+    } // end deallocate()
+
+    /*! This method returns the maximum size of the \c cnt parameter
+     *  accepted by the \p allocate() method.
+     *
+     *  \return The maximum number of objects that may be allocated
+     *          by a single call to \p allocate().
+     */
+    inline size_type max_size() const
+    {
+      return (std::numeric_limits<size_type>::max)() / sizeof(T);
+    } // end max_size()
+
+    /*! This method tests this \p pinned_allocator for equality to
+     *  another.
+     *
+     *  \param x The other \p pinned_allocator of interest.
+     *  \return This method always returns \c true.
+     */
+    __host__ __device__
+    inline bool operator==(pinned_allocator const& x) const { return true; }
+
+    /*! This method tests this \p pinned_allocator for inequality
+     *  to another.
+     *
+     *  \param x The other \p pinned_allocator of interest.
+     *  \return This method always returns \c false.
+     */
+    __host__ __device__
+    inline bool operator!=(pinned_allocator const &x) const { return !operator==(x); }
+}; // end pinned_allocator
+
+/*! \}
+ */
+
+} // end experimental
+
+} // end cuda
+
+} // end system
+
+// alias cuda's members at top-level
+namespace cuda
+{
+
+namespace experimental
+{
+
+using thrust::system::cuda::experimental::pinned_allocator;
+
+} // end experimental
+
+} // end cuda
+
+THRUST_NAMESPACE_END
diff --git a/src/tests/geometry/pointcloud.cpp b/src/tests/geometry/pointcloud.cpp
index eff0ca3a..d66a25c9 100644
--- a/src/tests/geometry/pointcloud.cpp
+++ b/src/tests/geometry/pointcloud.cpp
@@ -178,18 +178,25 @@ TEST(PointCloud, GetOrientedBoundingBox) {
     geometry::OrientedBoundingBox obb;
 
     // Valid 4 points
-    pcd = geometry::PointCloud({{0, 0, 0}, {0, 0, 1}, {0, 1, 0}, {1, 1, 1}});
+    thrust::host_vector<Vector3f> points;
+    points.push_back({0, 0, 0});
+    points.push_back({0, 0, 1});
+    points.push_back({0, 1, 0});
+    points.push_back({1, 1, 1});
+    pcd = geometry::PointCloud(points);
     pcd.GetOrientedBoundingBox();
 
     // 8 points with known ground truth
-    pcd = geometry::PointCloud({{0, 0, 0},
-                                {0, 0, 1},
-                                {0, 2, 0},
-                                {0, 2, 1},
-                                {3, 0, 0},
-                                {3, 0, 1},
-                                {3, 2, 0},
-                                {3, 2, 1}});
+    points.clear();
+    points.push_back({0, 0, 0});
+    points.push_back({0, 0, 1});
+    points.push_back({0, 2, 0});
+    points.push_back({0, 2, 1});
+    points.push_back({3, 0, 0});
+    points.push_back({3, 0, 1});
+    points.push_back({3, 2, 0});
+    points.push_back({3, 2, 1});
+    pcd = geometry::PointCloud(points);
     obb = pcd.GetOrientedBoundingBox();
     EXPECT_EQ(obb.center_, Eigen::Vector3f(1.5, 1, 0.5));
     EXPECT_EQ(obb.extent_, Eigen::Vector3f(3, 2, 1));

From 2fd9d019e15c0d32fd74a6a712ecb198af866967 Mon Sep 17 00:00:00 2001
From: nekanat <nekanat.stock@gmail.com>
Date: Sun, 24 Sep 2023 00:31:35 +0900
Subject: [PATCH 03/23] move header

---
 src/cupoch/knn/lbvh_knn.cu  | 23 ++++++++++++-----------
 src/cupoch/knn/lbvh_knn.h   | 15 +++++++++++----
 src/cupoch/knn/lbvh_knn.inl | 14 ++++++++++++--
 src/python/setup.py         |  1 +
 4 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/src/cupoch/knn/lbvh_knn.cu b/src/cupoch/knn/lbvh_knn.cu
index bd8cd852..c920e250 100644
--- a/src/cupoch/knn/lbvh_knn.cu
+++ b/src/cupoch/knn/lbvh_knn.cu
@@ -21,6 +21,8 @@
 #include <Eigen/Core>
 #include "cupoch/knn/lbvh_knn.h"
 
+#include <lbvh_index/aabb.cuh>
+#include <lbvh_index/lbvh.cuh>
 #include <lbvh_index/lbvh_kernels.cuh>
 #include "cupoch/utility/eigen.h"
 #include "cupoch/utility/platform.h"
@@ -72,29 +74,29 @@ bool LinearBoundingVolumeHierarchyKNN::SetRawData(const utility::device_vector<T
     thrust::transform(data.begin(), data.end(), make_tuple_begin(aabbs, data_float3_), convert_float3_and_aabb_functor<T>());
     T min_data = utility::ComputeMinBound<T::SizeAtCompileTime, typename T::Scalar>(data);
     T max_data = utility::ComputeMaxBound<T::SizeAtCompileTime, typename T::Scalar>(data);
-    extent_.min = make_float3(min_data[0], min_data[1], min_data[2]);
-    extent_.max = make_float3(max_data[0], max_data[1], max_data[2]);
+    extent_.first = Eigen::Vector3f(min_data[0], min_data[1], min_data[2]);
+    extent_.second = Eigen::Vector3f(max_data[0], max_data[1], max_data[2]);
 
     utility::device_vector<unsigned long long int> morton_codes(n_points_);
     thrust::transform(
         aabbs.begin(), aabbs.end(), morton_codes.begin(),
-        [this] __device__ (const lbvh::AABB& aabb) { return lbvh::morton_code(aabb, extent_); });
+        [this] __device__ (const lbvh::AABB& aabb) { return lbvh::morton_code(aabb, to_float3_aabb(extent_)); });
     dim3 block_dim, grid_dim;
     std::tie(block_dim, grid_dim) = utility::SelectBlockGridSizes(n_points_);
     compute_morton_points_kernel<<<grid_dim, block_dim>>>(
-        thrust::raw_pointer_cast(data_float3_.data()), extent_, thrust::raw_pointer_cast(morton_codes.data()), n_points_);
+        thrust::raw_pointer_cast(data_float3_.data()), to_float3_aabb(extent_), thrust::raw_pointer_cast(morton_codes.data()), n_points_);
     cudaSafeCall(cudaDeviceSynchronize());
     sorted_indices_.resize(morton_codes.size());
     thrust::sequence(sorted_indices_.begin(), sorted_indices_.end());
     thrust::sort_by_key(morton_codes.begin(), morton_codes.end(), make_tuple_begin(sorted_indices_, aabbs));
 
-    nodes_.resize(n_nodes_);
+    nodes_->resize(n_nodes_);
     initialize_tree_kernel<<<grid_dim, block_dim>>>(
-        thrust::raw_pointer_cast(nodes_.data()), thrust::raw_pointer_cast(aabbs.data()), n_points_);
+        thrust::raw_pointer_cast(nodes_->data()), thrust::raw_pointer_cast(aabbs.data()), n_points_);
     cudaSafeCall(cudaDeviceSynchronize());
     thrust::device_vector<unsigned int> root_node_index(1, std::numeric_limits<unsigned int>::max());
     construct_tree_kernel<<<grid_dim, block_dim>>>(
-        thrust::raw_pointer_cast(nodes_.data()),
+        thrust::raw_pointer_cast(nodes_->data()),
         thrust::raw_pointer_cast(root_node_index.data()),
         thrust::raw_pointer_cast(morton_codes.data()), n_points_);
     cudaSafeCall(cudaDeviceSynchronize());
@@ -102,7 +104,7 @@ bool LinearBoundingVolumeHierarchyKNN::SetRawData(const utility::device_vector<T
     if (leaf_size_ > 1) {
         utility::device_vector<unsigned int> valid(n_nodes_, 1);
         optimize_tree_kernel<<<grid_dim, block_dim>>>(
-            thrust::raw_pointer_cast(nodes_.data()),
+            thrust::raw_pointer_cast(nodes_->data()),
             thrust::raw_pointer_cast(root_node_index.data()),
             thrust::raw_pointer_cast(valid.data()), leaf_size_, n_points_);
         cudaSafeCall(cudaDeviceSynchronize());
@@ -125,14 +127,13 @@ bool LinearBoundingVolumeHierarchyKNN::SetRawData(const utility::device_vector<T
             unsigned int first_moved = valid_sums[new_node_count];
             std::tie(block_dim, grid_dim) = utility::SelectBlockGridSizes(n_nodes_);
             compact_tree_kernel<<<grid_dim, block_dim>>>(
-                thrust::raw_pointer_cast(nodes_.data()),
+                thrust::raw_pointer_cast(nodes_->data()),
                 thrust::raw_pointer_cast(root_node_index.data()),
                 thrust::raw_pointer_cast(valid_sums.data()),
                 thrust::raw_pointer_cast(free.data()),
                 first_moved, new_node_count, n_nodes_);
             if (shrink_to_fit_) {
-                utility::device_vector<lbvh::BVHNode> nodes_old(nodes_);
-                nodes_.resize(new_node_count);
+                nodes_->resize(new_node_count);
             }
             n_nodes_ = new_node_count;
         }
diff --git a/src/cupoch/knn/lbvh_knn.h b/src/cupoch/knn/lbvh_knn.h
index 32bed373..d8cadf47 100644
--- a/src/cupoch/knn/lbvh_knn.h
+++ b/src/cupoch/knn/lbvh_knn.h
@@ -19,12 +19,16 @@
  * IN THE SOFTWARE.
  **/
 #pragma once
+#include <Eigen/Core>
+
 #include "cupoch/knn/lbvh_knn.h"
-#include <lbvh_index/aabb.cuh>
-#include <lbvh_index/lbvh.cuh>
 
 #include "cupoch/utility/device_vector.h"
 
+namespace lbvh {
+struct BVHNode;
+}
+
 namespace cupoch {
 
 namespace geometry {
@@ -33,6 +37,9 @@ class Geometry;
 
 namespace knn {
 
+using AABB = std::pair<Eigen::Vector3f, Eigen::Vector3f>;
+
+
 class LinearBoundingVolumeHierarchyKNN {
 public:
     LinearBoundingVolumeHierarchyKNN(size_t leaf_size = 32, bool compact = true, bool sort_queries = true, bool shrink_to_fit = false);
@@ -67,8 +74,8 @@ class LinearBoundingVolumeHierarchyKNN {
     bool sort_queries_;
     bool shrink_to_fit_;
 
-    lbvh::AABB extent_;
-    utility::device_vector<lbvh::BVHNode> nodes_;
+    AABB extent_;
+    std::unique_ptr<utility::device_vector<lbvh::BVHNode>> nodes_;
     utility::device_vector<float3> data_float3_;
     utility::device_vector<unsigned int> sorted_indices_;
     unsigned int root_node_index_;
diff --git a/src/cupoch/knn/lbvh_knn.inl b/src/cupoch/knn/lbvh_knn.inl
index 525d785f..0dfe3a6c 100644
--- a/src/cupoch/knn/lbvh_knn.inl
+++ b/src/cupoch/knn/lbvh_knn.inl
@@ -21,6 +21,8 @@
 #include <thrust/sequence.h>
 #include <thrust/sort.h>
 #include "lbvh_knn.h"
+#include <lbvh_index/aabb.cuh>
+#include <lbvh_index/lbvh.cuh>
 #include <lbvh_index/lbvh_kernels.cuh>
 #include <lbvh_index/query_knn_kernels.cuh>
 
@@ -38,6 +40,14 @@ struct convert_float3_functor {
     }
 };
 
+__device__ __host__
+lbvh::AABB to_float3_aabb(const cupoch::knn::AABB& aabb) {
+    lbvh::AABB aabb_f3;
+    aabb_f3.min = make_float3(aabb.first[0], aabb.first[1], aabb.first[2]);
+    aabb_f3.max = make_float3(aabb.second[0], aabb.second[1], aabb.second[2]);
+    return aabb_f3;
+}
+
 }
 namespace cupoch {
 namespace knn {
@@ -59,7 +69,7 @@ int LinearBoundingVolumeHierarchyKNN::SearchKNN(InputIterator first,
         dim3 block_dim, grid_dim;
         std::tie(block_dim, grid_dim) = utility::SelectBlockGridSizes(num_query);
         compute_morton_points_kernel<<<grid_dim, block_dim>>>(
-            thrust::raw_pointer_cast(data_float3.data()), extent_, thrust::raw_pointer_cast(morton_codes.data()), num_query);
+            thrust::raw_pointer_cast(data_float3.data()), to_float3_aabb(extent_), thrust::raw_pointer_cast(morton_codes.data()), num_query);
         cudaSafeCall(cudaDeviceSynchronize());
         thrust::sort_by_key(morton_codes.begin(), morton_codes.end(), sorted_indices.begin());
     }
@@ -71,7 +81,7 @@ int LinearBoundingVolumeHierarchyKNN::SearchKNN(InputIterator first,
     utility::device_vector<unsigned int> neighbors(num_query, 0);
 
     query_knn_kernel<<<grid_dim, block_dim>>>(
-        thrust::raw_pointer_cast(nodes_.data()),
+        thrust::raw_pointer_cast(nodes_->data()),
         thrust::raw_pointer_cast(data_float3_.data()),
         thrust::raw_pointer_cast(sorted_indices_.data()),
         root_node_index_,
diff --git a/src/python/setup.py b/src/python/setup.py
index ba0017e3..9784e713 100644
--- a/src/python/setup.py
+++ b/src/python/setup.py
@@ -54,6 +54,7 @@ def find_stubs(package):
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
         "Topic :: Education",
         "Topic :: Multimedia :: Graphics :: 3D Rendering",
         "Topic :: Multimedia :: Graphics :: Viewers",

From 4c08d54b06e632ae40ac5d60576d81b8e4d27845 Mon Sep 17 00:00:00 2001
From: nekanat <nekanat.stock@gmail.com>
Date: Sun, 24 Sep 2023 01:29:58 +0900
Subject: [PATCH 04/23] remove lbvh_knn.inl

---
 src/cupoch/knn/lbvh_knn.cu  |  67 ++++++++++++++++++++++++
 src/cupoch/knn/lbvh_knn.h   |   3 --
 src/cupoch/knn/lbvh_knn.inl | 100 ------------------------------------
 3 files changed, 67 insertions(+), 103 deletions(-)
 delete mode 100644 src/cupoch/knn/lbvh_knn.inl

diff --git a/src/cupoch/knn/lbvh_knn.cu b/src/cupoch/knn/lbvh_knn.cu
index c920e250..2b297051 100644
--- a/src/cupoch/knn/lbvh_knn.cu
+++ b/src/cupoch/knn/lbvh_knn.cu
@@ -19,17 +19,40 @@
  * IN THE SOFTWARE.
  **/
 #include <Eigen/Core>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
 #include "cupoch/knn/lbvh_knn.h"
 
 #include <lbvh_index/aabb.cuh>
 #include <lbvh_index/lbvh.cuh>
 #include <lbvh_index/lbvh_kernels.cuh>
+#include <lbvh_index/query_knn_kernels.cuh>
+
 #include "cupoch/utility/eigen.h"
 #include "cupoch/utility/platform.h"
 #include "cupoch/utility/helper.h"
 
 
 namespace {
+
+template<int Dim>
+struct convert_float3_functor {
+    convert_float3_functor() {}
+    __device__
+    float3 operator() (const Eigen::Matrix<float, Dim, 1>& x) {
+        return make_float3(x[0], x[1], x[2]);
+    }
+};
+
+
+__device__ __host__
+lbvh::AABB to_float3_aabb(const cupoch::knn::AABB& aabb) {
+    lbvh::AABB aabb_f3;
+    aabb_f3.min = make_float3(aabb.first[0], aabb.first[1], aabb.first[2]);
+    aabb_f3.max = make_float3(aabb.second[0], aabb.second[1], aabb.second[2]);
+    return aabb_f3;
+}
+
 template<typename T>
 struct convert_float3_and_aabb_functor {
     convert_float3_and_aabb_functor() {}
@@ -142,6 +165,50 @@ bool LinearBoundingVolumeHierarchyKNN::SetRawData(const utility::device_vector<T
     return true;
 }
 
+template <typename InputIterator, int Dim>
+int LinearBoundingVolumeHierarchyKNN::SearchKNN(InputIterator first,
+                                                InputIterator last,
+                                                int knn,
+                                                utility::device_vector<unsigned int> &indices,
+                                                utility::device_vector<float> &distance2) const {
+    size_t num_query = thrust::distance(first, last);
+    utility::device_vector<float3> data_float3(num_query);
+    thrust::transform(first, last, data_float3.begin(), convert_float3_functor<Dim>());
+
+    utility::device_vector<unsigned long long int> morton_codes(num_query);
+    utility::device_vector<unsigned int> sorted_indices(num_query);
+    thrust::sequence(sorted_indices.begin(), sorted_indices.end());
+    if (sort_queries_) {
+        dim3 block_dim, grid_dim;
+        std::tie(block_dim, grid_dim) = utility::SelectBlockGridSizes(num_query);
+        compute_morton_points_kernel<<<grid_dim, block_dim>>>(
+            thrust::raw_pointer_cast(data_float3.data()), to_float3_aabb(extent_), thrust::raw_pointer_cast(morton_codes.data()), num_query);
+        cudaSafeCall(cudaDeviceSynchronize());
+        thrust::sort_by_key(morton_codes.begin(), morton_codes.end(), sorted_indices.begin());
+    }
+
+    dim3 block_dim, grid_dim;
+    std::tie(block_dim, grid_dim) = utility::SelectBlockGridSizes(num_query);
+    indices.resize(num_query * knn, std::numeric_limits<unsigned int>::max());
+    distance2.resize(num_query * knn, std::numeric_limits<float>::max());
+    utility::device_vector<unsigned int> neighbors(num_query, 0);
+
+    query_knn_kernel<<<grid_dim, block_dim>>>(
+        thrust::raw_pointer_cast(nodes_->data()),
+        thrust::raw_pointer_cast(data_float3_.data()),
+        thrust::raw_pointer_cast(sorted_indices_.data()),
+        root_node_index_,
+        std::numeric_limits<float>::max(),
+        thrust::raw_pointer_cast(data_float3.data()),
+        thrust::raw_pointer_cast(sorted_indices.data()),
+        num_query,
+        thrust::raw_pointer_cast(indices.data()),
+        thrust::raw_pointer_cast(distance2.data()),
+        thrust::raw_pointer_cast(neighbors.data()));
+    cudaSafeCall(cudaDeviceSynchronize());
+    return 1;
+}
+
 template bool LinearBoundingVolumeHierarchyKNN::SetRawData<Eigen::Vector3f>(
         const utility::device_vector<Eigen::Vector3f> &data);
 
diff --git a/src/cupoch/knn/lbvh_knn.h b/src/cupoch/knn/lbvh_knn.h
index d8cadf47..b4daff23 100644
--- a/src/cupoch/knn/lbvh_knn.h
+++ b/src/cupoch/knn/lbvh_knn.h
@@ -21,8 +21,6 @@
 #pragma once
 #include <Eigen/Core>
 
-#include "cupoch/knn/lbvh_knn.h"
-
 #include "cupoch/utility/device_vector.h"
 
 namespace lbvh {
@@ -84,4 +82,3 @@ class LinearBoundingVolumeHierarchyKNN {
 }
 }
 
-#include "cupoch/knn/lbvh_knn.inl"
diff --git a/src/cupoch/knn/lbvh_knn.inl b/src/cupoch/knn/lbvh_knn.inl
deleted file mode 100644
index 0dfe3a6c..00000000
--- a/src/cupoch/knn/lbvh_knn.inl
+++ /dev/null
@@ -1,100 +0,0 @@
-/**
- * Copyright (c) 2022 Neka-Nat
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- **/
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#include "lbvh_knn.h"
-#include <lbvh_index/aabb.cuh>
-#include <lbvh_index/lbvh.cuh>
-#include <lbvh_index/lbvh_kernels.cuh>
-#include <lbvh_index/query_knn_kernels.cuh>
-
-#include "cupoch/utility/platform.h"
-#include "cupoch/utility/helper.h"
-
-namespace {
-
-template<int Dim>
-struct convert_float3_functor {
-    convert_float3_functor() {}
-    __device__
-    float3 operator() (const Eigen::Matrix<float, Dim, 1>& x) {
-        return make_float3(x[0], x[1], x[2]);
-    }
-};
-
-__device__ __host__
-lbvh::AABB to_float3_aabb(const cupoch::knn::AABB& aabb) {
-    lbvh::AABB aabb_f3;
-    aabb_f3.min = make_float3(aabb.first[0], aabb.first[1], aabb.first[2]);
-    aabb_f3.max = make_float3(aabb.second[0], aabb.second[1], aabb.second[2]);
-    return aabb_f3;
-}
-
-}
-namespace cupoch {
-namespace knn {
-
-template <typename InputIterator, int Dim>
-int LinearBoundingVolumeHierarchyKNN::SearchKNN(InputIterator first,
-                                                InputIterator last,
-                                                int knn,
-                                                utility::device_vector<unsigned int> &indices,
-                                                utility::device_vector<float> &distance2) const {
-    size_t num_query = thrust::distance(first, last);
-    utility::device_vector<float3> data_float3(num_query);
-    thrust::transform(first, last, data_float3.begin(), convert_float3_functor<Dim>());
-
-    utility::device_vector<unsigned long long int> morton_codes(num_query);
-    utility::device_vector<unsigned int> sorted_indices(num_query);
-    thrust::sequence(sorted_indices.begin(), sorted_indices.end());
-    if (sort_queries_) {
-        dim3 block_dim, grid_dim;
-        std::tie(block_dim, grid_dim) = utility::SelectBlockGridSizes(num_query);
-        compute_morton_points_kernel<<<grid_dim, block_dim>>>(
-            thrust::raw_pointer_cast(data_float3.data()), to_float3_aabb(extent_), thrust::raw_pointer_cast(morton_codes.data()), num_query);
-        cudaSafeCall(cudaDeviceSynchronize());
-        thrust::sort_by_key(morton_codes.begin(), morton_codes.end(), sorted_indices.begin());
-    }
-
-    dim3 block_dim, grid_dim;
-    std::tie(block_dim, grid_dim) = utility::SelectBlockGridSizes(num_query);
-    indices.resize(num_query * knn, std::numeric_limits<unsigned int>::max());
-    distance2.resize(num_query * knn, std::numeric_limits<float>::max());
-    utility::device_vector<unsigned int> neighbors(num_query, 0);
-
-    query_knn_kernel<<<grid_dim, block_dim>>>(
-        thrust::raw_pointer_cast(nodes_->data()),
-        thrust::raw_pointer_cast(data_float3_.data()),
-        thrust::raw_pointer_cast(sorted_indices_.data()),
-        root_node_index_,
-        std::numeric_limits<float>::max(),
-        thrust::raw_pointer_cast(data_float3.data()),
-        thrust::raw_pointer_cast(sorted_indices.data()),
-        num_query,
-        thrust::raw_pointer_cast(indices.data()),
-        thrust::raw_pointer_cast(distance2.data()),
-        thrust::raw_pointer_cast(neighbors.data()));
-    cudaSafeCall(cudaDeviceSynchronize());
-    return 1;
-}
-
-}
-}
\ No newline at end of file

From ecae46e7da781d3f5fdb9ce9d91d4ab266fd5f9a Mon Sep 17 00:00:00 2001
From: nekanat <nekanat.stock@gmail.com>
Date: Sun, 24 Sep 2023 01:45:15 +0900
Subject: [PATCH 05/23] add host query function

---
 src/cupoch/knn/lbvh_knn.cu | 34 ++++++++++++++++++++++++++++++++++
 src/cupoch/knn/lbvh_knn.h  |  8 +++++++-
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/src/cupoch/knn/lbvh_knn.cu b/src/cupoch/knn/lbvh_knn.cu
index 2b297051..a69b0516 100644
--- a/src/cupoch/knn/lbvh_knn.cu
+++ b/src/cupoch/knn/lbvh_knn.cu
@@ -74,6 +74,14 @@ namespace knn {
 LinearBoundingVolumeHierarchyKNN::LinearBoundingVolumeHierarchyKNN(size_t leaf_size, bool compact, bool sort_queries, bool shrink_to_fit)
     : leaf_size_(leaf_size), compact_(compact), sort_queries_(sort_queries), shrink_to_fit_(shrink_to_fit) {}
 
+LinearBoundingVolumeHierarchyKNN::LinearBoundingVolumeHierarchyKNN(const utility::device_vector<Eigen::Vector3f> &data,
+                                                                   size_t leaf_size, bool compact, bool sort_queries, bool shrink_to_fit)
+    : leaf_size_(leaf_size), compact_(compact), sort_queries_(sort_queries), shrink_to_fit_(shrink_to_fit) {
+    SetRawData(data);
+}
+
+LinearBoundingVolumeHierarchyKNN::~LinearBoundingVolumeHierarchyKNN() {}
+
 template <typename T>
 int LinearBoundingVolumeHierarchyKNN::SearchKNN(const utility::device_vector<T> &query,
                                                 int knn,
@@ -88,6 +96,20 @@ int LinearBoundingVolumeHierarchyKNN::SearchKNN(const utility::device_vector<T>
                                            indices, distance2);
 }
 
+template <typename T>
+int LinearBoundingVolumeHierarchyKNN::SearchKNN(const T &query,
+                           int knn,
+                           thrust::host_vector<unsigned int> &indices,
+                           thrust::host_vector<float> &distance2) const {
+    utility::device_vector<T> query_dv(1, query);
+    utility::device_vector<unsigned int> indices_dv;
+    utility::device_vector<float> distance2_dv;
+    auto result = SearchKNN<T>(query_dv, knn, indices_dv, distance2_dv);
+    indices = indices_dv;
+    distance2 = distance2_dv;
+    return result;
+}
+
 template <typename T>
 bool LinearBoundingVolumeHierarchyKNN::SetRawData(const utility::device_vector<T> &data) {
     n_points_ = data.size();
@@ -209,6 +231,18 @@ int LinearBoundingVolumeHierarchyKNN::SearchKNN(InputIterator first,
     return 1;
 }
 
+template int LinearBoundingVolumeHierarchyKNN::SearchKNN<Eigen::Vector3f>(
+        const utility::device_vector<Eigen::Vector3f> &query,
+        int knn,
+        utility::device_vector<unsigned int> &indices,
+        utility::device_vector<float> &distance2) const;
+
+template int LinearBoundingVolumeHierarchyKNN::SearchKNN<Eigen::Vector3f>(
+        const Eigen::Vector3f &query,
+        int knn,
+        thrust::host_vector<unsigned int> &indices,
+        thrust::host_vector<float> &distance2) const;
+
 template bool LinearBoundingVolumeHierarchyKNN::SetRawData<Eigen::Vector3f>(
         const utility::device_vector<Eigen::Vector3f> &data);
 
diff --git a/src/cupoch/knn/lbvh_knn.h b/src/cupoch/knn/lbvh_knn.h
index b4daff23..0c786d29 100644
--- a/src/cupoch/knn/lbvh_knn.h
+++ b/src/cupoch/knn/lbvh_knn.h
@@ -41,7 +41,7 @@ using AABB = std::pair<Eigen::Vector3f, Eigen::Vector3f>;
 class LinearBoundingVolumeHierarchyKNN {
 public:
     LinearBoundingVolumeHierarchyKNN(size_t leaf_size = 32, bool compact = true, bool sort_queries = true, bool shrink_to_fit = false);
-    LinearBoundingVolumeHierarchyKNN(const geometry::Geometry &geometry);
+    LinearBoundingVolumeHierarchyKNN(const utility::device_vector<Eigen::Vector3f> &data, size_t leaf_size = 32, bool compact = true, bool sort_queries = true, bool shrink_to_fit = false);
     ~LinearBoundingVolumeHierarchyKNN();
     LinearBoundingVolumeHierarchyKNN(const LinearBoundingVolumeHierarchyKNN &) = delete;
     LinearBoundingVolumeHierarchyKNN &operator=(const LinearBoundingVolumeHierarchyKNN &) = delete;
@@ -60,6 +60,12 @@ class LinearBoundingVolumeHierarchyKNN {
                   utility::device_vector<unsigned int> &indices,
                   utility::device_vector<float> &distance2) const;
 
+    template <typename T>
+    int SearchKNN(const T &query,
+                  int knn,
+                  thrust::host_vector<unsigned int> &indices,
+                  thrust::host_vector<float> &distance2) const;
+
     template <typename T>
     bool SetRawData(const utility::device_vector<T> &data);
 

From db3f8e0ee2de00b5c65b0d69c94f0fabe5a5ed41 Mon Sep 17 00:00:00 2001
From: nekanat <nekanat.stock@gmail.com>
Date: Sun, 24 Sep 2023 10:00:35 +0900
Subject: [PATCH 06/23] fix bug

---
 src/cupoch/knn/lbvh_knn.cu | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/cupoch/knn/lbvh_knn.cu b/src/cupoch/knn/lbvh_knn.cu
index a69b0516..57523944 100644
--- a/src/cupoch/knn/lbvh_knn.cu
+++ b/src/cupoch/knn/lbvh_knn.cu
@@ -72,11 +72,14 @@ namespace cupoch {
 namespace knn {
 
 LinearBoundingVolumeHierarchyKNN::LinearBoundingVolumeHierarchyKNN(size_t leaf_size, bool compact, bool sort_queries, bool shrink_to_fit)
-    : leaf_size_(leaf_size), compact_(compact), sort_queries_(sort_queries), shrink_to_fit_(shrink_to_fit) {}
+    : leaf_size_(leaf_size), compact_(compact), sort_queries_(sort_queries), shrink_to_fit_(shrink_to_fit) {
+    nodes_ = std::make_unique<utility::device_vector<lbvh::BVHNode>>();
+}
 
 LinearBoundingVolumeHierarchyKNN::LinearBoundingVolumeHierarchyKNN(const utility::device_vector<Eigen::Vector3f> &data,
                                                                    size_t leaf_size, bool compact, bool sort_queries, bool shrink_to_fit)
     : leaf_size_(leaf_size), compact_(compact), sort_queries_(sort_queries), shrink_to_fit_(shrink_to_fit) {
+    nodes_ = std::make_unique<utility::device_vector<lbvh::BVHNode>>();
     SetRawData(data);
 }
 
@@ -121,15 +124,16 @@ bool LinearBoundingVolumeHierarchyKNN::SetRawData(const utility::device_vector<T
     T max_data = utility::ComputeMaxBound<T::SizeAtCompileTime, typename T::Scalar>(data);
     extent_.first = Eigen::Vector3f(min_data[0], min_data[1], min_data[2]);
     extent_.second = Eigen::Vector3f(max_data[0], max_data[1], max_data[2]);
+    auto extent_float3 = to_float3_aabb(extent_);
 
     utility::device_vector<unsigned long long int> morton_codes(n_points_);
     thrust::transform(
         aabbs.begin(), aabbs.end(), morton_codes.begin(),
-        [this] __device__ (const lbvh::AABB& aabb) { return lbvh::morton_code(aabb, to_float3_aabb(extent_)); });
+        [extent_float3] __device__ (const lbvh::AABB& aabb) { return lbvh::morton_code(aabb, extent_float3); });
     dim3 block_dim, grid_dim;
     std::tie(block_dim, grid_dim) = utility::SelectBlockGridSizes(n_points_);
     compute_morton_points_kernel<<<grid_dim, block_dim>>>(
-        thrust::raw_pointer_cast(data_float3_.data()), to_float3_aabb(extent_), thrust::raw_pointer_cast(morton_codes.data()), n_points_);
+        thrust::raw_pointer_cast(data_float3_.data()), extent_float3, thrust::raw_pointer_cast(morton_codes.data()), n_points_);
     cudaSafeCall(cudaDeviceSynchronize());
     sorted_indices_.resize(morton_codes.size());
     thrust::sequence(sorted_indices_.begin(), sorted_indices_.end());
@@ -194,6 +198,7 @@ int LinearBoundingVolumeHierarchyKNN::SearchKNN(InputIterator first,
                                                 utility::device_vector<unsigned int> &indices,
                                                 utility::device_vector<float> &distance2) const {
     size_t num_query = thrust::distance(first, last);
+    auto extent_float3 = to_float3_aabb(extent_);
     utility::device_vector<float3> data_float3(num_query);
     thrust::transform(first, last, data_float3.begin(), convert_float3_functor<Dim>());
 
@@ -204,7 +209,7 @@ int LinearBoundingVolumeHierarchyKNN::SearchKNN(InputIterator first,
         dim3 block_dim, grid_dim;
         std::tie(block_dim, grid_dim) = utility::SelectBlockGridSizes(num_query);
         compute_morton_points_kernel<<<grid_dim, block_dim>>>(
-            thrust::raw_pointer_cast(data_float3.data()), to_float3_aabb(extent_), thrust::raw_pointer_cast(morton_codes.data()), num_query);
+            thrust::raw_pointer_cast(data_float3.data()), extent_float3, thrust::raw_pointer_cast(morton_codes.data()), num_query);
         cudaSafeCall(cudaDeviceSynchronize());
         thrust::sort_by_key(morton_codes.begin(), morton_codes.end(), sorted_indices.begin());
     }

From 3b5f3eef0be7f8c04b7da8cdb1ee8797e722ee92 Mon Sep 17 00:00:00 2001
From: nekanat <nekanat.stock@gmail.com>
Date: Sun, 24 Sep 2023 17:54:44 +0900
Subject: [PATCH 07/23] add comments

---
 src/cupoch/knn/lbvh_knn.cu | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/cupoch/knn/lbvh_knn.cu b/src/cupoch/knn/lbvh_knn.cu
index 57523944..e23e4580 100644
--- a/src/cupoch/knn/lbvh_knn.cu
+++ b/src/cupoch/knn/lbvh_knn.cu
@@ -118,6 +118,10 @@ bool LinearBoundingVolumeHierarchyKNN::SetRawData(const utility::device_vector<T
     n_points_ = data.size();
     n_nodes_ = n_points_ * 2 - 1;
     data_float3_.resize(n_points_);
+    dim3 block_dim, grid_dim;
+    std::tie(block_dim, grid_dim) = utility::SelectBlockGridSizes(n_points_);
+
+    // init aabbs
     utility::device_vector<lbvh::AABB> aabbs(n_points_);
     thrust::transform(data.begin(), data.end(), make_tuple_begin(aabbs, data_float3_), convert_float3_and_aabb_functor<T>());
     T min_data = utility::ComputeMinBound<T::SizeAtCompileTime, typename T::Scalar>(data);
@@ -126,19 +130,18 @@ bool LinearBoundingVolumeHierarchyKNN::SetRawData(const utility::device_vector<T
     extent_.second = Eigen::Vector3f(max_data[0], max_data[1], max_data[2]);
     auto extent_float3 = to_float3_aabb(extent_);
 
-    utility::device_vector<unsigned long long int> morton_codes(n_points_);
+    // compute the morton codes of the aabbs
+    utility::device_vector<lbvh::HashType> morton_codes(n_points_);
     thrust::transform(
         aabbs.begin(), aabbs.end(), morton_codes.begin(),
         [extent_float3] __device__ (const lbvh::AABB& aabb) { return lbvh::morton_code(aabb, extent_float3); });
-    dim3 block_dim, grid_dim;
-    std::tie(block_dim, grid_dim) = utility::SelectBlockGridSizes(n_points_);
-    compute_morton_points_kernel<<<grid_dim, block_dim>>>(
-        thrust::raw_pointer_cast(data_float3_.data()), extent_float3, thrust::raw_pointer_cast(morton_codes.data()), n_points_);
-    cudaSafeCall(cudaDeviceSynchronize());
+
+    // sort everything by the morton codes
     sorted_indices_.resize(morton_codes.size());
     thrust::sequence(sorted_indices_.begin(), sorted_indices_.end());
     thrust::sort_by_key(morton_codes.begin(), morton_codes.end(), make_tuple_begin(sorted_indices_, aabbs));
 
+    // allocate space for the nodes as a raw cuda array
     nodes_->resize(n_nodes_);
     initialize_tree_kernel<<<grid_dim, block_dim>>>(
         thrust::raw_pointer_cast(nodes_->data()), thrust::raw_pointer_cast(aabbs.data()), n_points_);
@@ -157,6 +160,7 @@ bool LinearBoundingVolumeHierarchyKNN::SetRawData(const utility::device_vector<T
             thrust::raw_pointer_cast(root_node_index.data()),
             thrust::raw_pointer_cast(valid.data()), leaf_size_, n_points_);
         cudaSafeCall(cudaDeviceSynchronize());
+        // compact the tree to increase bandwidth
         if (compact_) {
             utility::device_vector<unsigned int> valid_sums(n_nodes_ + 1, 0);
             thrust::inclusive_scan(valid.begin(), valid.end(), valid_sums.begin() + 1);

From 0940a4525f6c9d31d6a946e193713fea10d00991 Mon Sep 17 00:00:00 2001
From: nekanat <nekanat.stock@gmail.com>
Date: Sun, 24 Sep 2023 20:59:02 +0900
Subject: [PATCH 08/23] fix bug

---
 src/cupoch/knn/lbvh_knn.cu     | 1 +
 src/cupoch/utility/platform.cu | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/cupoch/knn/lbvh_knn.cu b/src/cupoch/knn/lbvh_knn.cu
index e23e4580..2e607656 100644
--- a/src/cupoch/knn/lbvh_knn.cu
+++ b/src/cupoch/knn/lbvh_knn.cu
@@ -117,6 +117,7 @@ template <typename T>
 bool LinearBoundingVolumeHierarchyKNN::SetRawData(const utility::device_vector<T> &data) {
     n_points_ = data.size();
     n_nodes_ = n_points_ * 2 - 1;
+    dimension_ = T::SizeAtCompileTime;
     data_float3_.resize(n_points_);
     dim3 block_dim, grid_dim;
     std::tie(block_dim, grid_dim) = utility::SelectBlockGridSizes(n_points_);
diff --git a/src/cupoch/utility/platform.cu b/src/cupoch/utility/platform.cu
index 73736cfd..5a56fed1 100644
--- a/src/cupoch/utility/platform.cu
+++ b/src/cupoch/utility/platform.cu
@@ -70,7 +70,7 @@ std::tuple<dim3, dim3> SelectBlockGridSizes(int data_size, int threads_per_block
     cudaDeviceProp prop;
     cupoch::utility::GetDeviceProp(prop);
     int max_threads_per_block = prop.maxThreadsPerBlock;
-    if (threads_per_block < 0) {
+    if (threads_per_block > 0) {
         if (threads_per_block > max_threads_per_block) {
             throw std::runtime_error("Threads per block exceeds device maximum.");
         } else {

From bfaa191c749890d4ed9ebddcaf5091884c4af118 Mon Sep 17 00:00:00 2001
From: nekanat <nekanat.stock@gmail.com>
Date: Sun, 24 Sep 2023 23:51:43 +0900
Subject: [PATCH 09/23] add lbvh search nn

---
 src/cupoch/knn/lbvh_knn.cu                    | 49 +++++------
 src/cupoch/knn/lbvh_knn.h                     | 26 +++---
 src/tests/knn/lbvh_knn.cpp                    | 86 +++++++++++++++++++
 third_party/lbvh_index/lbvh_index/query.cuh   |  3 +-
 .../lbvh_index/lbvh_index/query_knn.cuh       |  3 -
 5 files changed, 125 insertions(+), 42 deletions(-)
 create mode 100644 src/tests/knn/lbvh_knn.cpp

diff --git a/src/cupoch/knn/lbvh_knn.cu b/src/cupoch/knn/lbvh_knn.cu
index 2e607656..4414bbbe 100644
--- a/src/cupoch/knn/lbvh_knn.cu
+++ b/src/cupoch/knn/lbvh_knn.cu
@@ -28,6 +28,7 @@
 #include <lbvh_index/lbvh_kernels.cuh>
 #include <lbvh_index/query_knn_kernels.cuh>
 
+#include "cupoch/knn/kdtree_search_param.h"
 #include "cupoch/utility/eigen.h"
 #include "cupoch/utility/platform.h"
 #include "cupoch/utility/helper.h"
@@ -86,28 +87,28 @@ LinearBoundingVolumeHierarchyKNN::LinearBoundingVolumeHierarchyKNN(const utility
 LinearBoundingVolumeHierarchyKNN::~LinearBoundingVolumeHierarchyKNN() {}
 
 template <typename T>
-int LinearBoundingVolumeHierarchyKNN::SearchKNN(const utility::device_vector<T> &query,
-                                                int knn,
-                                                utility::device_vector<unsigned int> &indices,
-                                                utility::device_vector<float> &distance2) const{
-    if (query.empty() || n_points_ <= 0 || n_nodes_ <= 0 || knn < 0)
+int LinearBoundingVolumeHierarchyKNN::SearchNN(const utility::device_vector<T> &query,
+                                               float radius,
+                                               utility::device_vector<unsigned int> &indices,
+                                               utility::device_vector<float> &distance2) const{
+    if (query.empty() || n_points_ <= 0 || n_nodes_ <= 0)
         return -1;
     T query0 = query[0];
     if (size_t(query0.size()) != dimension_) return -1;
-    return SearchKNN<typename utility::device_vector<T>::const_iterator,
-                     T::RowsAtCompileTime>(query.begin(), query.end(), knn,
-                                           indices, distance2);
+    return SearchNN<typename utility::device_vector<T>::const_iterator,
+                    T::RowsAtCompileTime>(query.begin(), query.end(), radius,
+                                          indices, distance2);
 }
 
 template <typename T>
-int LinearBoundingVolumeHierarchyKNN::SearchKNN(const T &query,
-                           int knn,
+int LinearBoundingVolumeHierarchyKNN::SearchNN(const T &query,
+                           float radius,
                            thrust::host_vector<unsigned int> &indices,
                            thrust::host_vector<float> &distance2) const {
     utility::device_vector<T> query_dv(1, query);
     utility::device_vector<unsigned int> indices_dv;
     utility::device_vector<float> distance2_dv;
-    auto result = SearchKNN<T>(query_dv, knn, indices_dv, distance2_dv);
+    auto result = SearchNN<T>(query_dv, radius, indices_dv, distance2_dv);
     indices = indices_dv;
     distance2 = distance2_dv;
     return result;
@@ -197,17 +198,17 @@ bool LinearBoundingVolumeHierarchyKNN::SetRawData(const utility::device_vector<T
 }
 
 template <typename InputIterator, int Dim>
-int LinearBoundingVolumeHierarchyKNN::SearchKNN(InputIterator first,
-                                                InputIterator last,
-                                                int knn,
-                                                utility::device_vector<unsigned int> &indices,
-                                                utility::device_vector<float> &distance2) const {
+int LinearBoundingVolumeHierarchyKNN::SearchNN(InputIterator first,
+                                               InputIterator last,
+                                               float radius,
+                                               utility::device_vector<unsigned int> &indices,
+                                               utility::device_vector<float> &distance2) const {
     size_t num_query = thrust::distance(first, last);
     auto extent_float3 = to_float3_aabb(extent_);
     utility::device_vector<float3> data_float3(num_query);
     thrust::transform(first, last, data_float3.begin(), convert_float3_functor<Dim>());
 
-    utility::device_vector<unsigned long long int> morton_codes(num_query);
+    utility::device_vector<lbvh::HashType> morton_codes(num_query);
     utility::device_vector<unsigned int> sorted_indices(num_query);
     thrust::sequence(sorted_indices.begin(), sorted_indices.end());
     if (sort_queries_) {
@@ -221,8 +222,8 @@ int LinearBoundingVolumeHierarchyKNN::SearchKNN(InputIterator first,
 
     dim3 block_dim, grid_dim;
     std::tie(block_dim, grid_dim) = utility::SelectBlockGridSizes(num_query);
-    indices.resize(num_query * knn, std::numeric_limits<unsigned int>::max());
-    distance2.resize(num_query * knn, std::numeric_limits<float>::max());
+    indices.resize(num_query, std::numeric_limits<unsigned int>::max());
+    distance2.resize(num_query, std::numeric_limits<float>::max());
     utility::device_vector<unsigned int> neighbors(num_query, 0);
 
     query_knn_kernel<<<grid_dim, block_dim>>>(
@@ -230,7 +231,7 @@ int LinearBoundingVolumeHierarchyKNN::SearchKNN(InputIterator first,
         thrust::raw_pointer_cast(data_float3_.data()),
         thrust::raw_pointer_cast(sorted_indices_.data()),
         root_node_index_,
-        std::numeric_limits<float>::max(),
+        radius * radius,
         thrust::raw_pointer_cast(data_float3.data()),
         thrust::raw_pointer_cast(sorted_indices.data()),
         num_query,
@@ -241,15 +242,15 @@ int LinearBoundingVolumeHierarchyKNN::SearchKNN(InputIterator first,
     return 1;
 }
 
-template int LinearBoundingVolumeHierarchyKNN::SearchKNN<Eigen::Vector3f>(
+template int LinearBoundingVolumeHierarchyKNN::SearchNN<Eigen::Vector3f>(
         const utility::device_vector<Eigen::Vector3f> &query,
-        int knn,
+        float radius,
         utility::device_vector<unsigned int> &indices,
         utility::device_vector<float> &distance2) const;
 
-template int LinearBoundingVolumeHierarchyKNN::SearchKNN<Eigen::Vector3f>(
+template int LinearBoundingVolumeHierarchyKNN::SearchNN<Eigen::Vector3f>(
         const Eigen::Vector3f &query,
-        int knn,
+        float radius,
         thrust::host_vector<unsigned int> &indices,
         thrust::host_vector<float> &distance2) const;
 
diff --git a/src/cupoch/knn/lbvh_knn.h b/src/cupoch/knn/lbvh_knn.h
index 0c786d29..aedc2bac 100644
--- a/src/cupoch/knn/lbvh_knn.h
+++ b/src/cupoch/knn/lbvh_knn.h
@@ -48,23 +48,23 @@ class LinearBoundingVolumeHierarchyKNN {
 
 public:
     template <typename InputIterator, int Dim>
-    int SearchKNN(InputIterator first,
-                  InputIterator last,
-                  int knn,
-                  utility::device_vector<unsigned int> &indices,
-                  utility::device_vector<float> &distance2) const;
+    int SearchNN(InputIterator first,
+                 InputIterator last,
+                 float radius,
+                 utility::device_vector<unsigned int> &indices,
+                 utility::device_vector<float> &distance2) const;
 
     template <typename T>
-    int SearchKNN(const utility::device_vector<T> &query,
-                  int knn,
-                  utility::device_vector<unsigned int> &indices,
-                  utility::device_vector<float> &distance2) const;
+    int SearchNN(const utility::device_vector<T> &query,
+                 float radius,
+                 utility::device_vector<unsigned int> &indices,
+                 utility::device_vector<float> &distance2) const;
 
     template <typename T>
-    int SearchKNN(const T &query,
-                  int knn,
-                  thrust::host_vector<unsigned int> &indices,
-                  thrust::host_vector<float> &distance2) const;
+    int SearchNN(const T &query,
+                 float radius,
+                 thrust::host_vector<unsigned int> &indices,
+                 thrust::host_vector<float> &distance2) const;
 
     template <typename T>
     bool SetRawData(const utility::device_vector<T> &data);
diff --git a/src/tests/knn/lbvh_knn.cpp b/src/tests/knn/lbvh_knn.cpp
new file mode 100644
index 00000000..d171ae48
--- /dev/null
+++ b/src/tests/knn/lbvh_knn.cpp
@@ -0,0 +1,86 @@
+/**
+ * Copyright (c) 2020 Neka-Nat
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+**/
+#include "cupoch/knn/lbvh_knn.h"
+
+#include <thrust/remove.h>
+#include <thrust/sort.h>
+
+#include "cupoch/geometry/pointcloud.h"
+#include "cupoch/geometry/geometry_utils.h"
+#include "tests/test_utility/unit_test.h"
+
+using namespace Eigen;
+using namespace cupoch;
+using namespace std;
+using namespace unit_test;
+
+namespace {
+
+struct is_minus_one_functor {
+    bool operator()(int x) const { return (x == -1); }
+};
+
+struct is_inf_functor {
+    bool operator()(float x) const { return std::isinf(x); }
+};
+
+}  // namespace
+
+TEST(LinearBoundingVolumeHierarchyKNN, SearchKNN) {
+    thrust::host_vector<int> ref_indices;
+    int indices0[] = {27, 48, 4,  77, 90, 7,  54, 17, 76, 38,
+                      39, 60, 15, 84, 11, 57, 3,  32, 99, 36,
+                      52, 40, 26, 59, 22, 97, 20, 42, 73, 24};
+    for (int i = 0; i < 30; ++i) ref_indices.push_back(indices0[i]);
+
+    thrust::host_vector<float> ref_distance2;
+    float distances0[] = {
+            0.000000,  4.684353,  4.996539,  9.191849,  10.034604, 10.466745,
+            10.649751, 11.434066, 12.089195, 13.345638, 13.696270, 14.016148,
+            16.851978, 17.073435, 18.254518, 20.019994, 21.496347, 23.077277,
+            23.692427, 23.809303, 24.104578, 25.005770, 26.952710, 27.487888,
+            27.998463, 28.262975, 28.581313, 28.816608, 31.603230, 31.610916};
+    for (int i = 0; i < 30; ++i) ref_distance2.push_back(distances0[i]);
+
+    int size = 100;
+
+    geometry::PointCloud pc;
+
+    Vector3f vmin(0.0, 0.0, 0.0);
+    Vector3f vmax(10.0, 10.0, 10.0);
+
+    thrust::host_vector<Eigen::Vector3f> points(size);
+    Rand(points, vmin, vmax, 0);
+    pc.SetPoints(points);
+
+    knn::LinearBoundingVolumeHierarchyKNN kdtree(geometry::ConvertVector3fVectorRef(pc));
+
+    Eigen::Vector3f query = {1.647059, 4.392157, 8.784314};
+    thrust::host_vector<unsigned int> indices;
+    thrust::host_vector<float> distance2;
+
+    int result = kdtree.SearchNN(query, std::numeric_limits<float>::max(), indices, distance2);
+
+    EXPECT_EQ(result, 1);
+
+    EXPECT_EQ(ref_indices[0], indices[0]);
+    EXPECT_NEAR(ref_distance2[0], distance2[0], 1.0e-9);
+}
diff --git a/third_party/lbvh_index/lbvh_index/query.cuh b/third_party/lbvh_index/lbvh_index/query.cuh
index 76e44613..5fa2e50c 100644
--- a/third_party/lbvh_index/lbvh_index/query.cuh
+++ b/third_party/lbvh_index/lbvh_index/query.cuh
@@ -4,9 +4,8 @@
 #include "vec_math.h"
 
 // default is one nearest neighbor
-#ifndef K
 #define K 1
-#endif
+
 
 namespace lbvh {
 
diff --git a/third_party/lbvh_index/lbvh_index/query_knn.cuh b/third_party/lbvh_index/lbvh_index/query_knn.cuh
index 2626a8e9..e5ee095b 100644
--- a/third_party/lbvh_index/lbvh_index/query_knn.cuh
+++ b/third_party/lbvh_index/lbvh_index/query_knn.cuh
@@ -3,9 +3,6 @@
 #include "static_priorityqueue.cuh"
 
 // default is one nearest neighbor
-#ifndef K
-#define K 1
-#endif
 
 namespace lbvh {
     __device__ void query_knn(const BVHNode* __restrict__ nodes,

From 79ad0fa76b9c38f8c194d77f10a3188dd53d94a4 Mon Sep 17 00:00:00 2001
From: nekanat <nekanat.stock@gmail.com>
Date: Mon, 25 Sep 2023 00:26:54 +0900
Subject: [PATCH 10/23] docker base image update

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index cd941ffa..ba3439b0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
 
-FROM nvidia/cuda:11.4.0-devel-ubuntu20.04
+FROM nvidia/cuda:11.4.3-devel-ubuntu20.04
 
 WORKDIR /work/cupoch
 

From ff13a0a0339f7079d72efc5b020774e1b8cc8363 Mon Sep 17 00:00:00 2001
From: nekanat <nekanat.stock@gmail.com>
Date: Mon, 25 Sep 2023 14:28:38 +0900
Subject: [PATCH 11/23] fix docker

---
 Dockerfile | 12 ++++++++++--
 README.md  | 21 ---------------------
 2 files changed, 10 insertions(+), 23 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index ba3439b0..38039fad 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
 
-FROM nvidia/cuda:11.4.3-devel-ubuntu20.04
+FROM nvidia/cuda:11.7.1-devel-ubuntu20.04
 
 WORKDIR /work/cupoch
 
@@ -9,6 +9,7 @@ ENV TZ Asia/Tokyo
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
          curl \
+         wget \
          build-essential \
          libxinerama-dev \
          libxcursor-dev \
@@ -19,6 +20,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
          python3-setuptools && \
      rm -rf /var/lib/apt/lists/*
 
+RUN wget https://github.com/Kitware/CMake/releases/download/v3.18.4/cmake-3.18.4.tar.gz \
+    && tar zxvf cmake-3.18.4.tar.gz \
+    && cd cmake-3.18.4 \
+    && ./bootstrap -- -DCMAKE_USE_OPENSSL=OFF \
+    && make && make install
+
 RUN curl -sSL https://install.python-poetry.org | python3 -
 
 ENV PATH $PATH:/root/.local/bin
@@ -35,4 +42,5 @@ ENV PYTHONPATH $PYTHONPATH:/usr/lib/python3.8/site-packages
 RUN mkdir build \
     && cd build \
     && cmake .. -DCMAKE_BUILD_TYPE=Release -DBUILD_GLEW=ON -DBUILD_GLFW=ON -DBUILD_PNG=ON -DBUILD_JSONCPP=ON \
-    && make install-pip-package
+    && make pip-package \
+    && pip install lib/python_package/pip_package/*.whl
diff --git a/README.md b/README.md
index c9c8fa36..acdf3c22 100644
--- a/README.md
+++ b/README.md
@@ -104,27 +104,6 @@ sudo make install-pip-package
 
 ### Use Docker
 
-Setting default container runtime to nvidia-container-runtime.
-Edit or create the `/etc/docker/daemon.json`.
-
-```sh
-{
-    "runtimes": {
-        "nvidia": {
-            "path": "/usr/bin/nvidia-container-runtime",
-            "runtimeArgs": []
-         }
-    },
-    "default-runtime": "nvidia"
-}
-```
-
-Restart docker daemon.
-
-```sh
-sudo systemctl restart docker
-```
-
 ```sh
 docker-compose up -d
 # xhost +

From ad76fd8ea249675f503bc99a6ffa486086021ea0 Mon Sep 17 00:00:00 2001
From: nekanat <nekanat.stock@gmail.com>
Date: Mon, 25 Sep 2023 15:42:35 +0900
Subject: [PATCH 12/23] update readme

---
 README.md | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index acdf3c22..21f2c568 100644
--- a/README.md
+++ b/README.md
@@ -123,11 +123,31 @@ The environment tested on has the following specs:
 
 You can get the result by running the example script in your environment.
 
-```
+```sh
 cd examples/python/basic
 python benchmarks.py
 ```
 
+If you get the following error when executing an example that includes 3D drawing, please start the program as follows.
+
+```sh
+$ cd examples/basic
+$ python visualization.py
+Load a ply point cloud, print it, and render it
+MESA: warning: Driver does not support the 0xa7a0 PCI ID.
+libGL error: failed to create dri screen
+libGL error: failed to load driver: iris
+MESA: warning: Driver does not support the 0xa7a0 PCI ID.
+libGL error: failed to create dri screen
+libGL error: failed to load driver: iris
+Error: unknown error	phong_shader.cu:330
+```
+
+```sh
+__NV_PRIME_RENDER_OFFLOAD=1 __GLX_VENDOR_LIBRARY_NAME=nvidia python visualization.py
+```
+
+
 ![speedup](https://raw.githubusercontent.com/neka-nat/cupoch/master/docs/_static/speedup.png)
 
 ### Visual odometry with intel realsense D435

From 8669d1ffd68a302756389893c1230166b58148f7 Mon Sep 17 00:00:00 2001
From: nekanat <nekanat.stock@gmail.com>
Date: Mon, 25 Sep 2023 22:47:00 +0900
Subject: [PATCH 13/23] small fix

---
 src/cupoch/knn/lbvh_knn.h             |  4 ++--
 src/cupoch/utility/pinned_allocator.h | 10 ++++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/cupoch/knn/lbvh_knn.h b/src/cupoch/knn/lbvh_knn.h
index aedc2bac..4f03739f 100644
--- a/src/cupoch/knn/lbvh_knn.h
+++ b/src/cupoch/knn/lbvh_knn.h
@@ -40,8 +40,8 @@ using AABB = std::pair<Eigen::Vector3f, Eigen::Vector3f>;
 
 class LinearBoundingVolumeHierarchyKNN {
 public:
-    LinearBoundingVolumeHierarchyKNN(size_t leaf_size = 32, bool compact = true, bool sort_queries = true, bool shrink_to_fit = false);
-    LinearBoundingVolumeHierarchyKNN(const utility::device_vector<Eigen::Vector3f> &data, size_t leaf_size = 32, bool compact = true, bool sort_queries = true, bool shrink_to_fit = false);
+    LinearBoundingVolumeHierarchyKNN(size_t leaf_size = 32, bool compact = false, bool sort_queries = false, bool shrink_to_fit = false);
+    LinearBoundingVolumeHierarchyKNN(const utility::device_vector<Eigen::Vector3f> &data, size_t leaf_size = 32, bool compact = false, bool sort_queries = false, bool shrink_to_fit = false);
     ~LinearBoundingVolumeHierarchyKNN();
     LinearBoundingVolumeHierarchyKNN(const LinearBoundingVolumeHierarchyKNN &) = delete;
     LinearBoundingVolumeHierarchyKNN &operator=(const LinearBoundingVolumeHierarchyKNN &) = delete;
diff --git a/src/cupoch/utility/pinned_allocator.h b/src/cupoch/utility/pinned_allocator.h
index 77c1986e..c126e048 100644
--- a/src/cupoch/utility/pinned_allocator.h
+++ b/src/cupoch/utility/pinned_allocator.h
@@ -28,6 +28,16 @@
 #include <thrust/system/system_error.h>
 #include <thrust/system/cuda/error.h>
 
+
+#ifndef THRUST_NAMESPACE_BEGIN
+#define THRUST_NAMESPACE_BEGIN namespace thrust {
+#endif // THRUST_NAMESPACE_BEGIN
+
+#ifndef THRUST_NAMESPACE_END
+#define THRUST_NAMESPACE_END }
+#endif // THRUST_NAMESPACE_END
+
+
 THRUST_NAMESPACE_BEGIN
 
 namespace system

From 744d5244341abdb049dc3e62e8d1c55db582d0be Mon Sep 17 00:00:00 2001
From: neka-nat <nekanat.stock@gmail.com>
Date: Sun, 26 Nov 2023 11:27:36 +0900
Subject: [PATCH 14/23] add stream function

---
 README.md                         |  2 +-
 src/cupoch/registration/kabsch.cu | 22 ++++++++++++++++++----
 src/cupoch/registration/kabsch.h  |  9 +++++++++
 src/cupoch/utility/eigen.h        | 10 ++++++++++
 src/cupoch/utility/eigen.inl      | 19 +++++++++++++++++++
 5 files changed, 57 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 21f2c568..d788b43f 100644
--- a/README.md
+++ b/README.md
@@ -80,7 +80,7 @@ You can also install cupoch using pip on Jetson Nano.
 Please set up Jetson using [jetpack](https://developer.nvidia.com/embedded/jetpack) and install some packages with apt.
 
 ```
-sudo apt-get install libxinerama-dev libxcursor-dev libglu1-mesa-dev
+sudo apt-get install xorg-dev libxinerama-dev libxcursor-dev libglu1-mesa-dev
 pip3 install cupoch
 ```
 
diff --git a/src/cupoch/registration/kabsch.cu b/src/cupoch/registration/kabsch.cu
index c336bd53..ae806207 100644
--- a/src/cupoch/registration/kabsch.cu
+++ b/src/cupoch/registration/kabsch.cu
@@ -32,13 +32,21 @@
 using namespace cupoch;
 using namespace cupoch::registration;
 
+Eigen::Matrix4f_u cupoch::registration::Kabsch(const utility::device_vector<Eigen::Vector3f> &model,
+                         const utility::device_vector<Eigen::Vector3f> &target,
+                         const CorrespondenceSet &corres) {
+    return Kabsch(utility::GetStream(0), utility::GetStream(1), model, target,
+                  corres);
+}
+
 Eigen::Matrix4f_u cupoch::registration::Kabsch(
+        cudaStream_t stream1, cudaStream_t stream2,
         const utility::device_vector<Eigen::Vector3f> &model,
         const utility::device_vector<Eigen::Vector3f> &target,
         const CorrespondenceSet &corres) {
     // Compute the center
     auto res1 = thrust::async::reduce(
-            utility::exec_policy(utility::GetStream(0)),
+            utility::exec_policy(stream1),
             thrust::make_permutation_iterator(
                     model.begin(),
                     thrust::make_transform_iterator(
@@ -51,7 +59,7 @@ Eigen::Matrix4f_u cupoch::registration::Kabsch(
                             element_get_functor<Eigen::Vector2i, 0>())),
             Eigen::Vector3f(0.0, 0.0, 0.0), thrust::plus<Eigen::Vector3f>());
     auto res2 = thrust::async::reduce(
-            utility::exec_policy(utility::GetStream(1)),
+            utility::exec_policy(stream2),
             thrust::make_permutation_iterator(
                     target.begin(),
                     thrust::make_transform_iterator(
@@ -73,7 +81,7 @@ Eigen::Matrix4f_u cupoch::registration::Kabsch(
     // Compute the H matrix
     const Eigen::Matrix3f init = Eigen::Matrix3f::Zero();
     Eigen::Matrix3f hh = thrust::inner_product(
-            utility::exec_policy(0),
+            utility::exec_policy(stream1),
             thrust::make_permutation_iterator(
                     model.begin(),
                     thrust::make_transform_iterator(
@@ -111,14 +119,20 @@ Eigen::Matrix4f_u cupoch::registration::Kabsch(
     return tr;
 }
 
+Eigen::Matrix4f_u cupoch::registration::Kabsch(const utility::device_vector<Eigen::Vector3f> &model,
+                         const utility::device_vector<Eigen::Vector3f> &target) {
+    return Kabsch(utility::GetStream(0), utility::GetStream(1), model, target);
+}
+
 Eigen::Matrix4f_u cupoch::registration::Kabsch(
+        cudaStream_t stream1, cudaStream_t stream2,
         const utility::device_vector<Eigen::Vector3f> &model,
         const utility::device_vector<Eigen::Vector3f> &target) {
     CorrespondenceSet corres(model.size());
     thrust::tabulate(corres.begin(), corres.end(), [] __device__(size_t idx) {
         return Eigen::Vector2i(idx, idx);
     });
-    return Kabsch(model, target, corres);
+    return Kabsch(stream1, stream2, model, target, corres);
 }
 
 Eigen::Matrix4f_u cupoch::registration::KabschWeighted(
diff --git a/src/cupoch/registration/kabsch.h b/src/cupoch/registration/kabsch.h
index ecab45f9..4c4bfc10 100644
--- a/src/cupoch/registration/kabsch.h
+++ b/src/cupoch/registration/kabsch.h
@@ -30,9 +30,18 @@ namespace registration {
 Eigen::Matrix4f_u Kabsch(const utility::device_vector<Eigen::Vector3f> &model,
                          const utility::device_vector<Eigen::Vector3f> &target,
                          const CorrespondenceSet &corres);
+Eigen::Matrix4f_u Kabsch(cudaStream_t stream1,
+                         cudaStream_t stream2,
+                         const utility::device_vector<Eigen::Vector3f> &model,
+                         const utility::device_vector<Eigen::Vector3f> &target,
+                         const CorrespondenceSet &corres);
 
 Eigen::Matrix4f_u Kabsch(const utility::device_vector<Eigen::Vector3f> &model,
                          const utility::device_vector<Eigen::Vector3f> &target);
+Eigen::Matrix4f_u Kabsch(cudaStream_t stream1,
+                         cudaStream_t stream2,
+                         const utility::device_vector<Eigen::Vector3f> &model,
+                         const utility::device_vector<Eigen::Vector3f> &target);
 
 Eigen::Matrix4f_u KabschWeighted(
         const utility::device_vector<Eigen::Vector3f> &model,
diff --git a/src/cupoch/utility/eigen.h b/src/cupoch/utility/eigen.h
index 9bdd2e58..2e7f2766 100644
--- a/src/cupoch/utility/eigen.h
+++ b/src/cupoch/utility/eigen.h
@@ -86,6 +86,11 @@ template <typename MatType, typename VecType, typename FuncType>
 thrust::tuple<MatType, VecType, float> ComputeJTJandJTr(const FuncType &f,
                                                         int iteration_num,
                                                         bool verbose = true);
+template <typename MatType, typename VecType, typename FuncType>
+thrust::tuple<MatType, VecType, float> ComputeJTJandJTr(cudaStream_t stream,
+                                                        const FuncType &f,
+                                                        int iteration_num,
+                                                        bool verbose = true);
 
 /// Function to compute JTJ and Jtr
 /// Input: function pointer f and total number of rows of Jacobian matrix
@@ -96,6 +101,11 @@ template <typename MatType, typename VecType, int NumJ, typename FuncType>
 thrust::tuple<MatType, VecType, float> ComputeJTJandJTr(const FuncType &f,
                                                         int iteration_num,
                                                         bool verbose = true);
+template <typename MatType, typename VecType, int NumJ, typename FuncType>
+thrust::tuple<MatType, VecType, float> ComputeJTJandJTr(cudaStream_t stream,
+                                                        FuncType &f,
+                                                        int iteration_num,
+                                                        bool verbose = true);
 
 template <typename MatType,
           typename VecType,
diff --git a/src/cupoch/utility/eigen.inl b/src/cupoch/utility/eigen.inl
index dc650034..ea7016a9 100644
--- a/src/cupoch/utility/eigen.inl
+++ b/src/cupoch/utility/eigen.inl
@@ -80,10 +80,20 @@ struct wrapped_calc_weights_functor {
 
 }  // namespace
 
+
 template <typename MatType, typename VecType, typename FuncType>
 thrust::tuple<MatType, VecType, float> ComputeJTJandJTr(const FuncType &f,
                                                         int iteration_num,
                                                         bool verbose) {
+    return ComputeJTJandJTr<MatType, VecType, FuncType>(
+            0, f, iteration_num, verbose);
+}
+
+template <typename MatType, typename VecType, typename FuncType>
+thrust::tuple<MatType, VecType, float> ComputeJTJandJTr(cudaStream_t stream,
+                                                        const FuncType &f,
+                                                        int iteration_num,
+                                                        bool verbose) {
     MatType JTJ;
     VecType JTr;
     float r2_sum = 0.0;
@@ -91,6 +101,7 @@ thrust::tuple<MatType, VecType, float> ComputeJTJandJTr(const FuncType &f,
     JTr.setZero();
     jtj_jtr_functor<MatType, VecType, FuncType> func(f);
     auto jtj_jtr_r2 = thrust::transform_reduce(
+            utility::exec_policy(stream),
             thrust::make_counting_iterator(0),
             thrust::make_counting_iterator(iteration_num), func,
             thrust::make_tuple(JTJ, JTr, r2_sum),
@@ -106,6 +117,13 @@ thrust::tuple<MatType, VecType, float> ComputeJTJandJTr(const FuncType &f,
 template <typename MatType, typename VecType, int NumJ, typename FuncType>
 thrust::tuple<MatType, VecType, float> ComputeJTJandJTr(
         const FuncType &f, int iteration_num, bool verbose /*=true*/) {
+    return ComputeJTJandJTr<MatType, VecType, NumJ, FuncType>(
+            0, f, iteration_num, verbose);
+}
+
+template <typename MatType, typename VecType, int NumJ, typename FuncType>
+thrust::tuple<MatType, VecType, float> ComputeJTJandJTr(
+        cudaStream_t stream, const FuncType &f, int iteration_num, bool verbose /*=true*/) {
     MatType JTJ;
     VecType JTr;
     float r2_sum = 0.0;
@@ -113,6 +131,7 @@ thrust::tuple<MatType, VecType, float> ComputeJTJandJTr(
     JTr.setZero();
     multiple_jtj_jtr_functor<MatType, VecType, NumJ, FuncType> func(f);
     auto jtj_jtr_r2 = thrust::transform_reduce(
+            utility::exec_policy(stream),
             thrust::make_counting_iterator(0),
             thrust::make_counting_iterator(iteration_num), func,
             thrust::make_tuple(JTJ, JTr, r2_sum),

From 03c4c2c4cc19676714af38f6bf19c2bb0a19e66c Mon Sep 17 00:00:00 2001
From: friendship1 <friendship1@dgist.ac.kr>
Date: Wed, 29 Nov 2023 23:27:29 +0900
Subject: [PATCH 15/23] fix bug when cropping with oriented bbox

---
 src/cupoch/geometry/boundingvolume.cu | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/cupoch/geometry/boundingvolume.cu b/src/cupoch/geometry/boundingvolume.cu
index e3622d61..d9d34057 100644
--- a/src/cupoch/geometry/boundingvolume.cu
+++ b/src/cupoch/geometry/boundingvolume.cu
@@ -51,7 +51,7 @@ struct check_within_oriented_bounding_box_functor {
     __device__ bool operator()(
             const thrust::tuple<int, Eigen::Vector3f> &x) const {
         const Eigen::Vector3f &point = thrust::get<1>(x);
-        return (test_plane(box_points_[0], box_points_[1], box_points_[3],
+        return ((test_plane(box_points_[0], box_points_[1], box_points_[3],
                            point) <= 0 &&
                 test_plane(box_points_[0], box_points_[5], box_points_[3],
                            point) >= 0 &&
@@ -62,7 +62,19 @@ struct check_within_oriented_bounding_box_functor {
                 test_plane(box_points_[3], box_points_[4], box_points_[5],
                            point) <= 0 &&
                 test_plane(box_points_[0], box_points_[1], box_points_[7],
-                           point) >= 0);
+                           point) >= 0) ||
+                (test_plane(box_points_[0], box_points_[1], box_points_[3],
+                           point) >= 0 &&
+                test_plane(box_points_[0], box_points_[5], box_points_[3],
+                           point) <= 0 &&
+                test_plane(box_points_[2], box_points_[5], box_points_[7],
+                           point) >= 0 &&
+                test_plane(box_points_[1], box_points_[4], box_points_[7],
+                           point) <= 0 &&
+                test_plane(box_points_[3], box_points_[4], box_points_[5],
+                           point) >= 0 &&
+                test_plane(box_points_[0], box_points_[1], box_points_[7],
+                           point) <= 0));
     }
 };
 

From 1010f99ac5b35a0b85efe59725dc4a68e50cb167 Mon Sep 17 00:00:00 2001
From: neka-nat <nekanat.stock@gmail.com>
Date: Thu, 30 Nov 2023 00:17:12 +0900
Subject: [PATCH 16/23] v0.2.9.0

---
 src/cupoch/geometry/boundingvolume.cu | 2 +-
 src/cupoch/version.txt                | 4 ++--
 src/python/pyproject.toml             | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/cupoch/geometry/boundingvolume.cu b/src/cupoch/geometry/boundingvolume.cu
index d9d34057..f3f9a4ca 100644
--- a/src/cupoch/geometry/boundingvolume.cu
+++ b/src/cupoch/geometry/boundingvolume.cu
@@ -42,7 +42,7 @@ struct check_within_oriented_bounding_box_functor {
     const std::array<Eigen::Vector3f, 8> box_points_;
     __device__ float test_plane(const Eigen::Vector3f &a,
                                 const Eigen::Vector3f &b,
-                                const Eigen::Vector3f c,
+                                const Eigen::Vector3f &c,
                                 const Eigen::Vector3f &x) const {
         Eigen::Matrix3f design;
         design << (b - a), (c - a), (x - a);
diff --git a/src/cupoch/version.txt b/src/cupoch/version.txt
index 0637c5c7..7d84b288 100644
--- a/src/cupoch/version.txt
+++ b/src/cupoch/version.txt
@@ -1,4 +1,4 @@
 CUPOCH_VERSION_MAJOR 0
 CUPOCH_VERSION_MINOR 2
-CUPOCH_VERSION_PATCH 8
-CUPOCH_VERSION_TWEAK 1
+CUPOCH_VERSION_PATCH 9
+CUPOCH_VERSION_TWEAK 0
diff --git a/src/python/pyproject.toml b/src/python/pyproject.toml
index 4198b363..96581aee 100644
--- a/src/python/pyproject.toml
+++ b/src/python/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "cupoch"
-version = "0.2.8.1"
+version = "0.2.9.0"
 description = ""
 authors = ["nekanat <nekanat.stock@gmail.com>"]
 license = "MIT"

From b84ea3aa5a5bd9d79cdeba201c22b6c05fc45662 Mon Sep 17 00:00:00 2001
From: "sewon.jeon" <irocks0922@gmail.com>
Date: Fri, 8 Dec 2023 18:55:20 +0900
Subject: [PATCH 17/23] symmetric icp

---
 .../registration/transformation_estimation.cu | 147 +++++++++++++++++-
 .../registration/transformation_estimation.h  |  30 +++-
 .../registration/registration.cpp             |  31 +++-
 3 files changed, 200 insertions(+), 8 deletions(-)

diff --git a/src/cupoch/registration/transformation_estimation.cu b/src/cupoch/registration/transformation_estimation.cu
index daad683e..a99713a2 100644
--- a/src/cupoch/registration/transformation_estimation.cu
+++ b/src/cupoch/registration/transformation_estimation.cu
@@ -55,6 +55,52 @@ struct pt2pl_jacobian_residual_functor
     }
 };
 
+struct pl2pl_jacobian_residual_functor
+    : public utility::jacobian_residual_functor<Eigen::Vector6f> {
+    pl2pl_jacobian_residual_functor(const Eigen::Vector3f *source,
+                                    const Eigen::Vector3f *source_normals,
+                                    const Eigen::Vector3f *target_points,
+                                    const Eigen::Vector3f *target_normals,
+                                    const Eigen::Vector2i *corres)
+        : source_(source),
+          source_normals_(source_normals),
+          target_points_(target_points),
+          target_normals_(target_normals),
+          corres_(corres){};
+    const Eigen::Vector3f *source_;
+    const Eigen::Vector3f *source_normals_;
+    const Eigen::Vector3f *target_points_;
+    const Eigen::Vector3f *target_normals_;
+    const Eigen::Vector2i *corres_;
+    __device__ void operator()(int idx, Eigen::Vector6f &vec, float &r) const {
+        const Eigen::Vector3f &vs = source_[corres_[idx][0]];
+        const Eigen::Vector3f &ns = source_normals_[corres_[idx][0]];
+        const Eigen::Vector3f &vt = target_points_[corres_[idx][1]];
+        const Eigen::Vector3f &nt = target_normals_[corres_[idx][1]];
+
+        Eigen::Vector3f n = ns + nt;  // sum of normals
+        Eigen::Vector3f d = vs - vt;  // difference of points
+
+        r = d.dot(n);  // symmetric residual
+
+        Eigen::Vector3f cross_product = (vs + vt).cross(n);
+        vec.block<3, 1>(0, 0) = cross_product;
+        vec.block<3, 1>(3, 0) = n;
+    }
+};
+
+// Define a function or a lambda to compute the error using normals
+__device__ float ComputeErrorUsingNormals(
+        const Eigen::Vector3f &source_point,
+        const Eigen::Vector3f &source_normal,
+        const Eigen::Vector3f &target_point,
+        const Eigen::Vector3f &target_normal) {
+    // Implement the error calculation logic here
+    // This is just a placeholder logic
+    return (source_point - target_point).dot(target_normal) +
+           (source_point - target_point).dot(source_normal);
+}
+
 }  // namespace
 
 float TransformationEstimationPointToPoint::ComputeRMSE(
@@ -114,7 +160,8 @@ float TransformationEstimationPointToPlane::ComputeRMSE(
                             target.normals_.begin(),
                             thrust::make_transform_iterator(
                                     corres.begin(),
-                                    element_get_functor<Eigen::Vector2i, 1>()))),
+                                    element_get_functor<Eigen::Vector2i,
+                                                        1>()))),
             make_tuple_iterator(
                     thrust::make_permutation_iterator(
                             source.points_.begin(),
@@ -130,7 +177,8 @@ float TransformationEstimationPointToPlane::ComputeRMSE(
                             target.normals_.begin(),
                             thrust::make_transform_iterator(
                                     corres.end(),
-                                    element_get_functor<Eigen::Vector2i, 1>()))),
+                                    element_get_functor<Eigen::Vector2i,
+                                                        1>()))),
             [] __device__(const thrust::tuple<Eigen::Vector3f, Eigen::Vector3f,
                                               Eigen::Vector3f> &x) {
                 float r = (thrust::get<0>(x) - thrust::get<1>(x))
@@ -167,5 +215,100 @@ Eigen::Matrix4f TransformationEstimationPointToPlane::ComputeTransformation(
             utility::SolveJacobianSystemAndObtainExtrinsicMatrix(JTJ, JTr,
                                                                  det_thresh_);
 
+    return is_success ? extrinsic : Eigen::Matrix4f::Identity();
+}
+
+float TransformationEstimationPlaneToPlane::ComputeRMSE(
+        const geometry::PointCloud &source,
+        const geometry::PointCloud &target,
+        const CorrespondenceSet &corres) const {
+    if (corres.empty() || !source.HasNormals() || !target.HasNormals())
+        return 0.0;
+    const float err = thrust::transform_reduce(
+            utility::exec_policy(0),
+            make_tuple_iterator(
+                    thrust::make_permutation_iterator(
+                            source.points_.begin(),
+                            thrust::make_transform_iterator(
+                                    corres.begin(),
+                                    element_get_functor<Eigen::Vector2i, 0>())),
+                    thrust::make_permutation_iterator(
+                            source.normals_.begin(),
+                            thrust::make_transform_iterator(
+                                    corres.begin(),
+                                    element_get_functor<Eigen::Vector2i, 0>())),
+                    thrust::make_permutation_iterator(
+                            target.points_.begin(),
+                            thrust::make_transform_iterator(
+                                    corres.begin(),
+                                    element_get_functor<Eigen::Vector2i, 1>())),
+                    thrust::make_permutation_iterator(
+                            target.normals_.begin(),
+                            thrust::make_transform_iterator(
+                                    corres.begin(),
+                                    element_get_functor<Eigen::Vector2i,
+                                                        1>()))),
+            make_tuple_iterator(
+                    thrust::make_permutation_iterator(
+                            source.points_.begin(),
+                            thrust::make_transform_iterator(
+                                    corres.end(),
+                                    element_get_functor<Eigen::Vector2i, 0>())),
+                    thrust::make_permutation_iterator(
+                            source.normals_.begin(),
+                            thrust::make_transform_iterator(
+                                    corres.end(),
+                                    element_get_functor<Eigen::Vector2i, 0>())),
+                    thrust::make_permutation_iterator(
+                            target.points_.begin(),
+                            thrust::make_transform_iterator(
+                                    corres.end(),
+                                    element_get_functor<Eigen::Vector2i, 1>())),
+                    thrust::make_permutation_iterator(
+                            target.normals_.begin(),
+                            thrust::make_transform_iterator(
+                                    corres.end(),
+                                    element_get_functor<Eigen::Vector2i,
+                                                        1>()))),
+            [] __device__(
+                    const thrust::tuple<Eigen::Vector3f, Eigen::Vector3f,
+                                        Eigen::Vector3f, Eigen::Vector3f> &x) {
+                // Compute error using both source and target normals
+                float r = ComputeErrorUsingNormals(
+                        thrust::get<0>(x), thrust::get<1>(x), thrust::get<2>(x),
+                        thrust::get<3>(x));
+                return r * r;
+            },
+            0.0f, thrust::plus<float>());
+    return std::sqrt(err / (float)corres.size());
+}
+
+Eigen::Matrix4f TransformationEstimationPlaneToPlane::ComputeTransformation(
+        const geometry::PointCloud &source,
+        const geometry::PointCloud &target,
+        const CorrespondenceSet &corres) const {
+    if (corres.empty() || !target.HasNormals())
+        return Eigen::Matrix4f::Identity();
+
+    Eigen::Matrix6f JTJ;
+    Eigen::Vector6f JTr;
+    float r2;
+    pl2pl_jacobian_residual_functor func(
+            thrust::raw_pointer_cast(source.points_.data()),
+            thrust::raw_pointer_cast(source.normals_.data()),
+            thrust::raw_pointer_cast(target.points_.data()),
+            thrust::raw_pointer_cast(target.normals_.data()),
+            thrust::raw_pointer_cast(corres.data()));
+    thrust::tie(JTJ, JTr, r2) =
+            utility::ComputeJTJandJTr<Eigen::Matrix6f, Eigen::Vector6f,
+                                      pl2pl_jacobian_residual_functor>(
+                    func, (int)corres.size());
+
+    bool is_success;
+    Eigen::Matrix4f extrinsic;
+    thrust::tie(is_success, extrinsic) =
+            utility::SolveJacobianSystemAndObtainExtrinsicMatrix(JTJ, JTr,
+                                                                 det_thresh_);
+
     return is_success ? extrinsic : Eigen::Matrix4f::Identity();
 }
\ No newline at end of file
diff --git a/src/cupoch/registration/transformation_estimation.h b/src/cupoch/registration/transformation_estimation.h
index a0aa262b..84949248 100644
--- a/src/cupoch/registration/transformation_estimation.h
+++ b/src/cupoch/registration/transformation_estimation.h
@@ -39,7 +39,8 @@ enum class TransformationEstimationType {
     Unspecified = 0,
     PointToPoint = 1,
     PointToPlane = 2,
-    ColoredICP = 3,
+    PlaneToPlane = 3,
+    ColoredICP = 4,
 };
 
 /// Base class that estimates a transformation between two point clouds
@@ -113,5 +114,32 @@ class TransformationEstimationPointToPlane : public TransformationEstimation {
             TransformationEstimationType::PointToPlane;
 };
 
+/// Estimate a transformation for plane to plane distance
+class TransformationEstimationPlaneToPlane : public TransformationEstimation {
+public:
+    TransformationEstimationPlaneToPlane(float det_thresh = 1.0e-6)
+        : det_thresh_(det_thresh) {}
+    ~TransformationEstimationPlaneToPlane() override {}
+
+public:
+    TransformationEstimationType GetTransformationEstimationType()
+            const override {
+        return type_;
+    };
+    float ComputeRMSE(const geometry::PointCloud &source,
+                      const geometry::PointCloud &target,
+                      const CorrespondenceSet &corres) const override;
+    Eigen::Matrix4f ComputeTransformation(
+            const geometry::PointCloud &source,
+            const geometry::PointCloud &target,
+            const CorrespondenceSet &corres) const override;
+
+    float det_thresh_;
+
+private:
+    const TransformationEstimationType type_ =
+            TransformationEstimationType::PlaneToPlane;
+};
+
 }  // namespace registration
 }  // namespace cupoch
\ No newline at end of file
diff --git a/src/python/cupoch_pybind/registration/registration.cpp b/src/python/cupoch_pybind/registration/registration.cpp
index 2990b2b2..853eb31c 100644
--- a/src/python/cupoch_pybind/registration/registration.cpp
+++ b/src/python/cupoch_pybind/registration/registration.cpp
@@ -119,9 +119,10 @@ void pybind_registration_classes(py::module &m) {
 
     py::enum_<registration::TransformationEstimationType> tf_type(m, "TransformationEstimationType");
     tf_type.value("PointToPoint", registration::TransformationEstimationType::PointToPoint)
-           .value("PointToPlane", registration::TransformationEstimationType::PointToPlane)
-           .value("ColoredICP", registration::TransformationEstimationType::ColoredICP)
-           .export_values();
+            .value("PointToPlane", registration::TransformationEstimationType::PointToPlane)
+            .value("PlaneToPlane", registration::TransformationEstimationType::PlaneToPlane)
+            .value("ColoredICP", registration::TransformationEstimationType::ColoredICP)
+            .export_values();
 
     // cupoch.registration.TransformationEstimationPointToPoint:
     // TransformationEstimation
@@ -164,6 +165,25 @@ void pybind_registration_classes(py::module &m) {
                 return std::string("TransformationEstimationPointToPlane");
             });
 
+    // cupoch.registration.TransformationEstimationPlaneToPlane:
+    // TransformationEstimation
+    py::class_<registration::TransformationEstimationPlaneToPlane,
+               PyTransformationEstimation<
+                       registration::TransformationEstimationPlaneToPlane>,
+               registration::TransformationEstimation>
+            te_l2l(m, "TransformationEstimationPlaneToPlane",
+                   "Class to estimate a transformation for plane to plane "
+                   "distance.");
+    py::detail::bind_default_constructor<
+            registration::TransformationEstimationPlaneToPlane>(te_l2l);
+    py::detail::bind_copy_functions<
+            registration::TransformationEstimationPlaneToPlane>(te_l2l);
+    te_l2l.def(
+            "__repr__",
+            [](const registration::TransformationEstimationPlaneToPlane &te) {
+                return std::string("TransformationEstimationPlaneToPlane");
+            });
+
     // cupoch.registration.FastGlobalRegistrationOption:
     py::class_<registration::FastGlobalRegistrationOption> fgr_option(
             m, "FastGlobalRegistrationOption",
@@ -351,8 +371,9 @@ static const std::unordered_map<std::string, std::string>
                 {"criteria", "Convergence criteria"},
                 {"estimation_method",
                  "Estimation method. One of "
-                 "(``registration::TransformationEstimationPointToPoint``, "
-                 "``registration::TransformationEstimationPointToPlane``)"},
+                 "(``registration::TransformationEstimationPointToPoint``,"
+                 "``registration::TransformationEstimationPointToPlane``,"
+                 "``registration::TransformationEstimationPlaneToPlane``)"},
                 {"init", "Initial transformation estimation"},
                 {"lambda_geometric", "lambda_geometric value"},
                 {"max_correspondence_distance",

From f85a6a31b6f65696ed5e42f9009d9f5e6ef6c04c Mon Sep 17 00:00:00 2001
From: "sewon.jeon" <sewon.jeon@connecteve.com>
Date: Fri, 8 Dec 2023 22:44:57 +0900
Subject: [PATCH 18/23] Rename to SymmetricMethod

---
 .../registration/transformation_estimation.cu |  4 ++--
 .../registration/transformation_estimation.h  | 10 +++++-----
 .../registration/registration.cpp             | 20 +++++++++----------
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/cupoch/registration/transformation_estimation.cu b/src/cupoch/registration/transformation_estimation.cu
index a99713a2..a50934c3 100644
--- a/src/cupoch/registration/transformation_estimation.cu
+++ b/src/cupoch/registration/transformation_estimation.cu
@@ -218,7 +218,7 @@ Eigen::Matrix4f TransformationEstimationPointToPlane::ComputeTransformation(
     return is_success ? extrinsic : Eigen::Matrix4f::Identity();
 }
 
-float TransformationEstimationPlaneToPlane::ComputeRMSE(
+float TransformationEstimationSymmetricMethod::ComputeRMSE(
         const geometry::PointCloud &source,
         const geometry::PointCloud &target,
         const CorrespondenceSet &corres) const {
@@ -283,7 +283,7 @@ float TransformationEstimationPlaneToPlane::ComputeRMSE(
     return std::sqrt(err / (float)corres.size());
 }
 
-Eigen::Matrix4f TransformationEstimationPlaneToPlane::ComputeTransformation(
+Eigen::Matrix4f TransformationEstimationSymmetricMethod::ComputeTransformation(
         const geometry::PointCloud &source,
         const geometry::PointCloud &target,
         const CorrespondenceSet &corres) const {
diff --git a/src/cupoch/registration/transformation_estimation.h b/src/cupoch/registration/transformation_estimation.h
index 84949248..fe344abd 100644
--- a/src/cupoch/registration/transformation_estimation.h
+++ b/src/cupoch/registration/transformation_estimation.h
@@ -39,7 +39,7 @@ enum class TransformationEstimationType {
     Unspecified = 0,
     PointToPoint = 1,
     PointToPlane = 2,
-    PlaneToPlane = 3,
+    SymmetricMethod = 3,
     ColoredICP = 4,
 };
 
@@ -115,11 +115,11 @@ class TransformationEstimationPointToPlane : public TransformationEstimation {
 };
 
 /// Estimate a transformation for plane to plane distance
-class TransformationEstimationPlaneToPlane : public TransformationEstimation {
+class TransformationEstimationSymmetricMethod : public TransformationEstimation {
 public:
-    TransformationEstimationPlaneToPlane(float det_thresh = 1.0e-6)
+    TransformationEstimationSymmetricMethod(float det_thresh = 1.0e-6)
         : det_thresh_(det_thresh) {}
-    ~TransformationEstimationPlaneToPlane() override {}
+    ~TransformationEstimationSymmetricMethod() override {}
 
 public:
     TransformationEstimationType GetTransformationEstimationType()
@@ -138,7 +138,7 @@ class TransformationEstimationPlaneToPlane : public TransformationEstimation {
 
 private:
     const TransformationEstimationType type_ =
-            TransformationEstimationType::PlaneToPlane;
+            TransformationEstimationType::SymmetricMethod;
 };
 
 }  // namespace registration
diff --git a/src/python/cupoch_pybind/registration/registration.cpp b/src/python/cupoch_pybind/registration/registration.cpp
index 853eb31c..f00ac38d 100644
--- a/src/python/cupoch_pybind/registration/registration.cpp
+++ b/src/python/cupoch_pybind/registration/registration.cpp
@@ -120,7 +120,7 @@ void pybind_registration_classes(py::module &m) {
     py::enum_<registration::TransformationEstimationType> tf_type(m, "TransformationEstimationType");
     tf_type.value("PointToPoint", registration::TransformationEstimationType::PointToPoint)
             .value("PointToPlane", registration::TransformationEstimationType::PointToPlane)
-            .value("PlaneToPlane", registration::TransformationEstimationType::PlaneToPlane)
+            .value("SymmetricMethod", registration::TransformationEstimationType::SymmetricMethod)
             .value("ColoredICP", registration::TransformationEstimationType::ColoredICP)
             .export_values();
 
@@ -165,23 +165,23 @@ void pybind_registration_classes(py::module &m) {
                 return std::string("TransformationEstimationPointToPlane");
             });
 
-    // cupoch.registration.TransformationEstimationPlaneToPlane:
+    // cupoch.registration.TransformationEstimationSymmetricMethod:
     // TransformationEstimation
-    py::class_<registration::TransformationEstimationPlaneToPlane,
+    py::class_<registration::TransformationEstimationSymmetricMethod,
                PyTransformationEstimation<
-                       registration::TransformationEstimationPlaneToPlane>,
+                       registration::TransformationEstimationSymmetricMethod>,
                registration::TransformationEstimation>
-            te_l2l(m, "TransformationEstimationPlaneToPlane",
+            te_l2l(m, "TransformationEstimationSymmetricMethod",
                    "Class to estimate a transformation for plane to plane "
                    "distance.");
     py::detail::bind_default_constructor<
-            registration::TransformationEstimationPlaneToPlane>(te_l2l);
+            registration::TransformationEstimationSymmetricMethod>(te_l2l);
     py::detail::bind_copy_functions<
-            registration::TransformationEstimationPlaneToPlane>(te_l2l);
+            registration::TransformationEstimationSymmetricMethod>(te_l2l);
     te_l2l.def(
             "__repr__",
-            [](const registration::TransformationEstimationPlaneToPlane &te) {
-                return std::string("TransformationEstimationPlaneToPlane");
+            [](const registration::TransformationEstimationSymmetricMethod &te) {
+                return std::string("TransformationEstimationSymmetricMethod");
             });
 
     // cupoch.registration.FastGlobalRegistrationOption:
@@ -373,7 +373,7 @@ static const std::unordered_map<std::string, std::string>
                  "Estimation method. One of "
                  "(``registration::TransformationEstimationPointToPoint``,"
                  "``registration::TransformationEstimationPointToPlane``,"
-                 "``registration::TransformationEstimationPlaneToPlane``)"},
+                 "``registration::TransformationEstimationSymmetricMethod``)"},
                 {"init", "Initial transformation estimation"},
                 {"lambda_geometric", "lambda_geometric value"},
                 {"max_correspondence_distance",

From e411205f843780f75ba5ec03dd64e3ee9a7fb8f8 Mon Sep 17 00:00:00 2001
From: neka-nat <nekanat.stock@gmail.com>
Date: Fri, 8 Dec 2023 22:53:43 +0900
Subject: [PATCH 19/23] update README

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index d788b43f..32e828d2 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,7 @@ This repository is based on [Open3D](https://github.com/intel-isl/Open3D).
         * [flann](https://github.com/flann-lib/flann)
     * Point cloud registration
         * ICP
+        * [Symmetric ICP](https://gfx.cs.princeton.edu/pubs/Rusinkiewicz_2019_ASO/symm_icp.pdf)(Implemented by @eclipse0922 )
         * [Colored Point Cloud Registration](https://ieeexplore.ieee.org/document/8237287)
         * [Fast Global Registration](http://vladlen.info/papers/fast-global-registration.pdf)
         * [FilterReg](https://arxiv.org/abs/1811.10136)

From 249e25c099c3430b467965254ab8a03b669f99cc Mon Sep 17 00:00:00 2001
From: neka-nat <nekanat.stock@gmail.com>
Date: Fri, 8 Dec 2023 22:55:45 +0900
Subject: [PATCH 20/23] add link

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 32e828d2..5af2fc18 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ This repository is based on [Open3D](https://github.com/intel-isl/Open3D).
         * [flann](https://github.com/flann-lib/flann)
     * Point cloud registration
         * ICP
-        * [Symmetric ICP](https://gfx.cs.princeton.edu/pubs/Rusinkiewicz_2019_ASO/symm_icp.pdf)(Implemented by @eclipse0922 )
+        * [Symmetric ICP](https://gfx.cs.princeton.edu/pubs/Rusinkiewicz_2019_ASO/symm_icp.pdf) (Implemented by [@eclipse0922](https://github.com/eclipse0922))
         * [Colored Point Cloud Registration](https://ieeexplore.ieee.org/document/8237287)
         * [Fast Global Registration](http://vladlen.info/papers/fast-global-registration.pdf)
         * [FilterReg](https://arxiv.org/abs/1811.10136)

From 56029f79a0f0b794e1c4908a38fe51de0645702e Mon Sep 17 00:00:00 2001
From: neka-nat <nekanat.stock@gmail.com>
Date: Fri, 8 Dec 2023 23:00:23 +0900
Subject: [PATCH 21/23] v0.2.10.0

---
 src/cupoch/version.txt    | 2 +-
 src/python/pyproject.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cupoch/version.txt b/src/cupoch/version.txt
index 7d84b288..d570c031 100644
--- a/src/cupoch/version.txt
+++ b/src/cupoch/version.txt
@@ -1,4 +1,4 @@
 CUPOCH_VERSION_MAJOR 0
 CUPOCH_VERSION_MINOR 2
-CUPOCH_VERSION_PATCH 9
+CUPOCH_VERSION_PATCH 10
 CUPOCH_VERSION_TWEAK 0
diff --git a/src/python/pyproject.toml b/src/python/pyproject.toml
index 96581aee..09a21b43 100644
--- a/src/python/pyproject.toml
+++ b/src/python/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "cupoch"
-version = "0.2.9.0"
+version = "0.2.10.0"
 description = ""
 authors = ["nekanat <nekanat.stock@gmail.com>"]
 license = "MIT"

From ca7c29b8fde20ceba3b4eaa5e0affb4bdbf66935 Mon Sep 17 00:00:00 2001
From: neka-nat <nekanat.stock@gmail.com>
Date: Sat, 9 Dec 2023 13:33:20 +0900
Subject: [PATCH 22/23] fix publish package

---
 .github/workflows/ubuntu.yml  | 5 ++---
 .github/workflows/windows.yml | 5 ++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml
index b6cbeedb..9522c4ed 100644
--- a/.github/workflows/ubuntu.yml
+++ b/.github/workflows/ubuntu.yml
@@ -50,9 +50,8 @@ jobs:
     - name: Publish package
       if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
       env:
-        PYPI_USERNAME: ${{ secrets.pypi_username }}
-        PYPI_PASSWORD: ${{ secrets.pypi_password }}
+        PYPI_API_TOKEN: ${{ secrets.pypi_api_token }}
       run: |
         python -m pip install twine
         cd build/lib/python_package
-        python -m twine upload -u ${PYPI_USERNAME} -p ${PYPI_PASSWORD} pip_package/*
\ No newline at end of file
+        python -m twine upload -u "__token__" -p ${PYPI_API_TOKEN} pip_package/*
\ No newline at end of file
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 6afe6ad0..6ddd6da4 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -94,8 +94,7 @@ jobs:
       if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') && matrix.cuda == '11.2.0'
       working-directory: ${{ env.build_dir }}/lib/python_package
       env:
-        PYPI_USERNAME: ${{ secrets.pypi_username }}
-        PYPI_PASSWORD: ${{ secrets.pypi_password }}
+        PYPI_API_TOKEN: ${{ secrets.pypi_api_token }}
       run: |
         python -m pip install twine
-        python -m twine upload -u $env:PYPI_USERNAME -p $env:PYPI_PASSWORD pip_package/*
\ No newline at end of file
+        python -m twine upload -u "__token__" -p $env:PYPI_API_TOKEN pip_package/*
\ No newline at end of file

From 303d8d41b0768e077dc3dd051fa50fad084030a4 Mon Sep 17 00:00:00 2001
From: "sewon.jeon" <sewon.jeon@connecteve.com>
Date: Sun, 10 Dec 2023 11:36:50 +0900
Subject: [PATCH 23/23] Improve symmetric ICP

- more accurate computation
- normal check
- renaming
---
 .../registration/transformation_estimation.cu | 60 +++++++++++++++----
 1 file changed, 48 insertions(+), 12 deletions(-)

diff --git a/src/cupoch/registration/transformation_estimation.cu b/src/cupoch/registration/transformation_estimation.cu
index a50934c3..e8602d2b 100644
--- a/src/cupoch/registration/transformation_estimation.cu
+++ b/src/cupoch/registration/transformation_estimation.cu
@@ -55,9 +55,9 @@ struct pt2pl_jacobian_residual_functor
     }
 };
 
-struct pl2pl_jacobian_residual_functor
+struct symmetric_jacobian_residual_functor
     : public utility::jacobian_residual_functor<Eigen::Vector6f> {
-    pl2pl_jacobian_residual_functor(const Eigen::Vector3f *source,
+    symmetric_jacobian_residual_functor(const Eigen::Vector3f *source,
                                     const Eigen::Vector3f *source_normals,
                                     const Eigen::Vector3f *target_points,
                                     const Eigen::Vector3f *target_normals,
@@ -95,10 +95,13 @@ __device__ float ComputeErrorUsingNormals(
         const Eigen::Vector3f &source_normal,
         const Eigen::Vector3f &target_point,
         const Eigen::Vector3f &target_normal) {
-    // Implement the error calculation logic here
-    // This is just a placeholder logic
-    return (source_point - target_point).dot(target_normal) +
-           (source_point - target_point).dot(source_normal);
+    // Symmetric treatment of normals
+    Eigen::Vector3f combined_normal = source_normal + target_normal;
+
+    // Compute the symmetric point-to-plane error
+    float error = (source_point - target_point).dot(combined_normal);
+
+    return error * error;  // Return squared error for consistency
 }
 
 }  // namespace
@@ -287,13 +290,13 @@ Eigen::Matrix4f TransformationEstimationSymmetricMethod::ComputeTransformation(
         const geometry::PointCloud &source,
         const geometry::PointCloud &target,
         const CorrespondenceSet &corres) const {
-    if (corres.empty() || !target.HasNormals())
+    if (corres.empty() || !source.HasNormals() || !target.HasNormals())
         return Eigen::Matrix4f::Identity();
 
     Eigen::Matrix6f JTJ;
     Eigen::Vector6f JTr;
     float r2;
-    pl2pl_jacobian_residual_functor func(
+    symmetric_jacobian_residual_functor func(
             thrust::raw_pointer_cast(source.points_.data()),
             thrust::raw_pointer_cast(source.normals_.data()),
             thrust::raw_pointer_cast(target.points_.data()),
@@ -301,14 +304,47 @@ Eigen::Matrix4f TransformationEstimationSymmetricMethod::ComputeTransformation(
             thrust::raw_pointer_cast(corres.data()));
     thrust::tie(JTJ, JTr, r2) =
             utility::ComputeJTJandJTr<Eigen::Matrix6f, Eigen::Vector6f,
-                                      pl2pl_jacobian_residual_functor>(
+                                      symmetric_jacobian_residual_functor>(
                     func, (int)corres.size());
 
     bool is_success;
-    Eigen::Matrix4f extrinsic;
-    thrust::tie(is_success, extrinsic) =
+    Eigen::Matrix4f transformation_matrix;
+    thrust::tie(is_success, transformation_matrix) =
             utility::SolveJacobianSystemAndObtainExtrinsicMatrix(JTJ, JTr,
                                                                  det_thresh_);
+    if (is_success) {
+        // Extract the rotation matrix (3x3 upper-left block) from the 4x4
+        // transformation matrix. This matrix represents the rotation component
+        // of the transformation.
+        Eigen::Matrix3f rotation_matrix_3f =
+                transformation_matrix.template block<3, 3>(0, 0);
 
-    return is_success ? extrinsic : Eigen::Matrix4f::Identity();
+        // Convert the rotation matrix to double precision for higher accuracy.
+        // This step is essential to maintain accuracy in the subsequent
+        // squaring operation.
+        Eigen::Matrix3d rotation_matrix_3d = rotation_matrix_3f.cast<double>();
+
+        // Square the rotation matrix as described in the original Symmetric ICP
+        // paper. This approach, part of the rotated-normals version of the
+        // symmetric objective, optimizes for half of the rotation angle,
+        // reducing linearization error and enabling exact correspondences in
+        // certain cases.
+        Eigen::Matrix3d R = rotation_matrix_3d * rotation_matrix_3d;
+
+        // Construct the final transformation matrix using the squared rotation
+        // matrix (R) and the translation vector. The translation component is
+        // extracted directly from the original 4x4 transformation matrix.
+        Eigen::Matrix4f extrinsic = Eigen::Matrix4f::Identity();
+        extrinsic.template block<3, 3>(0, 0) =
+                R.cast<float>();  // Set the rotation part with the squared
+                                  // rotation matrix.
+        extrinsic.template block<3, 1>(0, 3) =
+                transformation_matrix.template block<3, 1>(
+                        0, 3);  // Set the translation part from the original
+                                // transformation matrix.
+
+        return extrinsic;
+    }
+
+    return Eigen::Matrix4f::Identity();
 }
\ No newline at end of file