diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml
index 5cc117a6..f2ddcde0 100644
--- a/.github/workflows/ubuntu.yml
+++ b/.github/workflows/ubuntu.yml
@@ -12,9 +12,7 @@ jobs:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-        python-version:
-#          - "3.8"
-          - "3.11"
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
 
     steps:
     - name: Checkout source code
@@ -71,9 +69,8 @@ jobs:
     - name: Publish package
       if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
       env:
-        PYPI_USERNAME: ${{ secrets.pypi_username }}
-        PYPI_PASSWORD: ${{ secrets.pypi_password }}
+        PYPI_API_TOKEN: ${{ secrets.pypi_api_token }}
       run: |
         python -m pip install twine
         cd build/lib/python_package
-        python -m twine upload -u ${PYPI_USERNAME} -p ${PYPI_PASSWORD} pip_package/*
\ No newline at end of file
+        python -m twine upload -u "__token__" -p ${PYPI_API_TOKEN} pip_package/*
\ No newline at end of file
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index a62dab25..78470c6f 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -100,8 +100,7 @@ jobs:
       if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
       working-directory: ${{ env.build_dir }}/lib/python_package
       env:
-        PYPI_USERNAME: ${{ secrets.pypi_username }}
-        PYPI_PASSWORD: ${{ secrets.pypi_password }}
+        PYPI_API_TOKEN: ${{ secrets.pypi_api_token }}
       run: |
         python -m pip install twine
-        python -m twine upload -u $env:PYPI_USERNAME -p $env:PYPI_PASSWORD pip_package/*
\ No newline at end of file
+        python -m twine upload -u "__token__" -p $env:PYPI_API_TOKEN pip_package/*
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 7660e991..97c0fc39 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
 
-FROM nvidia/cuda:11.4.0-devel-ubuntu20.04
+FROM nvidia/cuda:11.7.1-devel-ubuntu20.04
 
 WORKDIR /work/cupoch
 
@@ -39,4 +39,5 @@ RUN mkdir build \
 COPY . .
 RUN cd build \
     && cmake .. -DCMAKE_BUILD_TYPE=Release \
-    && make install-pip-package
+    && make pip-package \
+    && pip install lib/python_package/pip_package/*.whl
diff --git a/README.md b/README.md
index af930df7..bb0ab12b 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,7 @@ This repository is based on [Open3D](https://github.com/intel-isl/Open3D).
         * [flann](https://github.com/flann-lib/flann)
     * Point cloud registration
         * ICP
+        * [Symmetric ICP](https://gfx.cs.princeton.edu/pubs/Rusinkiewicz_2019_ASO/symm_icp.pdf) (Implemented by [@eclipse0922](https://github.com/eclipse0922))
         * [Colored Point Cloud Registration](https://ieeexplore.ieee.org/document/8237287)
         * [Fast Global Registration](http://vladlen.info/papers/fast-global-registration.pdf)
         * [FilterReg](https://arxiv.org/abs/1811.10136)
@@ -83,7 +84,7 @@ You can also install cupoch using pip on Jetson Nano.
 Please set up Jetson using [jetpack](https://developer.nvidia.com/embedded/jetpack) and install some packages with apt.
 
 ```
-sudo apt-get install libxinerama-dev libxcursor-dev libglu1-mesa-dev
+sudo apt-get install xorg-dev libxinerama-dev libxcursor-dev libglu1-mesa-dev
 pip3 install cupoch
 ```
 
@@ -107,27 +108,6 @@ sudo make install-pip-package
 
 ### Use Docker
 
-Setting default container runtime to nvidia-container-runtime.
-Edit or create the `/etc/docker/daemon.json`.
-
-```sh
-{
-    "runtimes": {
-        "nvidia": {
-            "path": "/usr/bin/nvidia-container-runtime",
-            "runtimeArgs": []
-         }
-    },
-    "default-runtime": "nvidia"
-}
-```
-
-Restart docker daemon.
-
-```sh
-sudo systemctl restart docker
-```
-
 ```sh
 docker-compose up -d
 # xhost +
@@ -147,11 +127,31 @@ The environment tested on has the following specs:
 
 You can get the result by running the example script in your environment.
 
-```
+```sh
 cd examples/python/basic
 python benchmarks.py
 ```
 
+If you get the following error when executing an example that includes 3D drawing, please start the program as follows.
+
+```sh
+$ cd examples/basic
+$ python visualization.py
+Load a ply point cloud, print it, and render it
+MESA: warning: Driver does not support the 0xa7a0 PCI ID.
+libGL error: failed to create dri screen
+libGL error: failed to load driver: iris
+MESA: warning: Driver does not support the 0xa7a0 PCI ID.
+libGL error: failed to create dri screen
+libGL error: failed to load driver: iris
+Error: unknown error	phong_shader.cu:330
+```
+
+```sh
+__NV_PRIME_RENDER_OFFLOAD=1 __GLX_VENDOR_LIBRARY_NAME=nvidia python visualization.py
+```
+
+
 ![speedup](https://raw.githubusercontent.com/neka-nat/cupoch/master/docs/_static/speedup.png)
 
 ### Visual odometry with intel realsense D435
diff --git a/conanfile.py b/conanfile.py
index a1e0497a..35ccb9cf 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -27,7 +27,7 @@
 
 class CupochConan(ConanFile):
     name = "cupoch"
-    version = "0.2.8.1"
+    version = "0.2.10.0"
     package_type = "library"
 
     license = "MIT"
diff --git a/src/cupoch/geometry/boundingvolume.cu b/src/cupoch/geometry/boundingvolume.cu
index e3622d61..f3f9a4ca 100644
--- a/src/cupoch/geometry/boundingvolume.cu
+++ b/src/cupoch/geometry/boundingvolume.cu
@@ -42,7 +42,7 @@ struct check_within_oriented_bounding_box_functor {
     const std::array<Eigen::Vector3f, 8> box_points_;
     __device__ float test_plane(const Eigen::Vector3f &a,
                                 const Eigen::Vector3f &b,
-                                const Eigen::Vector3f c,
+                                const Eigen::Vector3f &c,
                                 const Eigen::Vector3f &x) const {
         Eigen::Matrix3f design;
         design << (b - a), (c - a), (x - a);
@@ -51,7 +51,7 @@ struct check_within_oriented_bounding_box_functor {
     __device__ bool operator()(
             const thrust::tuple<int, Eigen::Vector3f> &x) const {
         const Eigen::Vector3f &point = thrust::get<1>(x);
-        return (test_plane(box_points_[0], box_points_[1], box_points_[3],
+        return ((test_plane(box_points_[0], box_points_[1], box_points_[3],
                            point) <= 0 &&
                 test_plane(box_points_[0], box_points_[5], box_points_[3],
                            point) >= 0 &&
@@ -62,7 +62,19 @@ struct check_within_oriented_bounding_box_functor {
                 test_plane(box_points_[3], box_points_[4], box_points_[5],
                            point) <= 0 &&
                 test_plane(box_points_[0], box_points_[1], box_points_[7],
-                           point) >= 0);
+                           point) >= 0) ||
+                (test_plane(box_points_[0], box_points_[1], box_points_[3],
+                           point) >= 0 &&
+                test_plane(box_points_[0], box_points_[5], box_points_[3],
+                           point) <= 0 &&
+                test_plane(box_points_[2], box_points_[5], box_points_[7],
+                           point) >= 0 &&
+                test_plane(box_points_[1], box_points_[4], box_points_[7],
+                           point) <= 0 &&
+                test_plane(box_points_[3], box_points_[4], box_points_[5],
+                           point) >= 0 &&
+                test_plane(box_points_[0], box_points_[1], box_points_[7],
+                           point) <= 0));
     }
 };
 
diff --git a/src/cupoch/geometry/laserscanbuffer.cu b/src/cupoch/geometry/laserscanbuffer.cu
index 8facabbc..4706f5f9 100644
--- a/src/cupoch/geometry/laserscanbuffer.cu
+++ b/src/cupoch/geometry/laserscanbuffer.cu
@@ -298,6 +298,11 @@ template LaserScanBuffer& LaserScanBuffer::AddRanges(
         const Eigen::Matrix4f& transformation,
         const utility::pinned_host_vector<float>& intensities);
 
+template LaserScanBuffer& LaserScanBuffer::AddRanges(
+        const thrust::host_vector<float>& ranges,
+        const Eigen::Matrix4f& transformation,
+        const thrust::host_vector<float>& intensities);
+
 
 class ContainerLikePtr {
 public:
diff --git a/src/cupoch/geometry/occupancygrid.cu b/src/cupoch/geometry/occupancygrid.cu
index e6f7fa1d..de19ee95 100644
--- a/src/cupoch/geometry/occupancygrid.cu
+++ b/src/cupoch/geometry/occupancygrid.cu
@@ -526,6 +526,17 @@ OccupancyGrid& OccupancyGrid::Insert(
     return Insert(dev_points, viewpoint, max_range);
 }
 
+OccupancyGrid& OccupancyGrid::Insert(
+        const thrust::host_vector<Eigen::Vector3f>& points,
+        const Eigen::Vector3f& viewpoint,
+        float max_range) {
+    utility::device_vector<Eigen::Vector3f> dev_points(points.size());
+    cudaSafeCall(cudaMemcpy(
+            thrust::raw_pointer_cast(dev_points.data()), thrust::raw_pointer_cast(points.data()),
+            points.size() * sizeof(Eigen::Vector3f), cudaMemcpyHostToDevice));
+    return Insert(dev_points, viewpoint, max_range);
+}
+
 OccupancyGrid& OccupancyGrid::Insert(const geometry::PointCloud& pointcloud,
                                      const Eigen::Vector3f& viewpoint,
                                      float max_range) {
diff --git a/src/cupoch/geometry/occupancygrid.h b/src/cupoch/geometry/occupancygrid.h
index c06cc001..303e8776 100644
--- a/src/cupoch/geometry/occupancygrid.h
+++ b/src/cupoch/geometry/occupancygrid.h
@@ -108,6 +108,10 @@ class OccupancyGrid : public DenseGrid<OccupancyVoxel> {
             const utility::pinned_host_vector<Eigen::Vector3f>& points,
             const Eigen::Vector3f& viewpoint,
             float max_range = -1.0);
+    OccupancyGrid& Insert(
+            const thrust::host_vector<Eigen::Vector3f>& points,
+            const Eigen::Vector3f& viewpoint,
+            float max_range = -1.0);
     OccupancyGrid& Insert(const PointCloud& pointcloud,
                           const Eigen::Vector3f& viewpoint,
                           float max_range = -1.0);
diff --git a/src/cupoch/integration/scalable_tsdfvolume.cu b/src/cupoch/integration/scalable_tsdfvolume.cu
index d8301c56..4180bd8d 100644
--- a/src/cupoch/integration/scalable_tsdfvolume.cu
+++ b/src/cupoch/integration/scalable_tsdfvolume.cu
@@ -125,8 +125,6 @@ __global__ void OpenVolumeUnitKernel(const Eigen::Vector3f *points,
 struct extract_pointcloud_functor {
     extract_pointcloud_functor(
             const VolumeUnitsMap &volume_units,
-            const stdgpu::device_indexed_range<const VolumeUnitsMap::value_type>
-                    &range,
             int resolution,
             float voxel_length,
             float volume_unit_length,
@@ -135,7 +133,7 @@ struct extract_pointcloud_functor {
             Eigen::Vector3f *normals,
             Eigen::Vector3f *colors)
         : volume_units_(volume_units),
-          range_(range),
+          range_(volume_units.device_range()),
           resolution_(resolution),
           voxel_length_(voxel_length),
           half_voxel_length_(0.5 * voxel_length_),
@@ -363,7 +361,7 @@ std::shared_ptr<geometry::PointCloud> ScalableTSDFVolume::ExtractPointCloud() {
     pointcloud->normals_.resize(3 * n_total, nanvec);
     pointcloud->colors_.resize(3 * n_total, nanvec);
     extract_pointcloud_functor func(
-            impl_->volume_units_, impl_->volume_units_.device_range(),
+            impl_->volume_units_,
             resolution_, voxel_length_, volume_unit_length_, color_type_,
             thrust::raw_pointer_cast(pointcloud->points_.data()),
             thrust::raw_pointer_cast(pointcloud->normals_.data()),
diff --git a/src/cupoch/knn/lbvh_knn.cu b/src/cupoch/knn/lbvh_knn.cu
index bd8cd852..4414bbbe 100644
--- a/src/cupoch/knn/lbvh_knn.cu
+++ b/src/cupoch/knn/lbvh_knn.cu
@@ -19,15 +19,41 @@
  * IN THE SOFTWARE.
  **/
 #include <Eigen/Core>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
 #include "cupoch/knn/lbvh_knn.h"
 
+#include <lbvh_index/aabb.cuh>
+#include <lbvh_index/lbvh.cuh>
 #include <lbvh_index/lbvh_kernels.cuh>
+#include <lbvh_index/query_knn_kernels.cuh>
+
+#include "cupoch/knn/kdtree_search_param.h"
 #include "cupoch/utility/eigen.h"
 #include "cupoch/utility/platform.h"
 #include "cupoch/utility/helper.h"
 
 
 namespace {
+
+template<int Dim>
+struct convert_float3_functor {
+    convert_float3_functor() {}
+    __device__
+    float3 operator() (const Eigen::Matrix<float, Dim, 1>& x) {
+        return make_float3(x[0], x[1], x[2]);
+    }
+};
+
+
+__device__ __host__
+lbvh::AABB to_float3_aabb(const cupoch::knn::AABB& aabb) {
+    lbvh::AABB aabb_f3;
+    aabb_f3.min = make_float3(aabb.first[0], aabb.first[1], aabb.first[2]);
+    aabb_f3.max = make_float3(aabb.second[0], aabb.second[1], aabb.second[2]);
+    return aabb_f3;
+}
+
 template<typename T>
 struct convert_float3_and_aabb_functor {
     convert_float3_and_aabb_functor() {}
@@ -47,54 +73,84 @@ namespace cupoch {
 namespace knn {
 
 LinearBoundingVolumeHierarchyKNN::LinearBoundingVolumeHierarchyKNN(size_t leaf_size, bool compact, bool sort_queries, bool shrink_to_fit)
-    : leaf_size_(leaf_size), compact_(compact), sort_queries_(sort_queries), shrink_to_fit_(shrink_to_fit) {}
+    : leaf_size_(leaf_size), compact_(compact), sort_queries_(sort_queries), shrink_to_fit_(shrink_to_fit) {
+    nodes_ = std::make_unique<utility::device_vector<lbvh::BVHNode>>();
+}
+
+LinearBoundingVolumeHierarchyKNN::LinearBoundingVolumeHierarchyKNN(const utility::device_vector<Eigen::Vector3f> &data,
+                                                                   size_t leaf_size, bool compact, bool sort_queries, bool shrink_to_fit)
+    : leaf_size_(leaf_size), compact_(compact), sort_queries_(sort_queries), shrink_to_fit_(shrink_to_fit) {
+    nodes_ = std::make_unique<utility::device_vector<lbvh::BVHNode>>();
+    SetRawData(data);
+}
+
+LinearBoundingVolumeHierarchyKNN::~LinearBoundingVolumeHierarchyKNN() {}
 
 template <typename T>
-int LinearBoundingVolumeHierarchyKNN::SearchKNN(const utility::device_vector<T> &query,
-                                                int knn,
-                                                utility::device_vector<unsigned int> &indices,
-                                                utility::device_vector<float> &distance2) const{
-    if (query.empty() || n_points_ <= 0 || n_nodes_ <= 0 || knn < 0)
+int LinearBoundingVolumeHierarchyKNN::SearchNN(const utility::device_vector<T> &query,
+                                               float radius,
+                                               utility::device_vector<unsigned int> &indices,
+                                               utility::device_vector<float> &distance2) const{
+    if (query.empty() || n_points_ <= 0 || n_nodes_ <= 0)
         return -1;
     T query0 = query[0];
     if (size_t(query0.size()) != dimension_) return -1;
-    return SearchKNN<typename utility::device_vector<T>::const_iterator,
-                     T::RowsAtCompileTime>(query.begin(), query.end(), knn,
-                                           indices, distance2);
+    return SearchNN<typename utility::device_vector<T>::const_iterator,
+                    T::RowsAtCompileTime>(query.begin(), query.end(), radius,
+                                          indices, distance2);
+}
+
+template <typename T>
+int LinearBoundingVolumeHierarchyKNN::SearchNN(const T &query,
+                           float radius,
+                           thrust::host_vector<unsigned int> &indices,
+                           thrust::host_vector<float> &distance2) const {
+    utility::device_vector<T> query_dv(1, query);
+    utility::device_vector<unsigned int> indices_dv;
+    utility::device_vector<float> distance2_dv;
+    auto result = SearchNN<T>(query_dv, radius, indices_dv, distance2_dv);
+    indices = indices_dv;
+    distance2 = distance2_dv;
+    return result;
 }
 
 template <typename T>
 bool LinearBoundingVolumeHierarchyKNN::SetRawData(const utility::device_vector<T> &data) {
     n_points_ = data.size();
     n_nodes_ = n_points_ * 2 - 1;
+    dimension_ = T::SizeAtCompileTime;
     data_float3_.resize(n_points_);
+    dim3 block_dim, grid_dim;
+    std::tie(block_dim, grid_dim) = utility::SelectBlockGridSizes(n_points_);
+
+    // init aabbs
     utility::device_vector<lbvh::AABB> aabbs(n_points_);
     thrust::transform(data.begin(), data.end(), make_tuple_begin(aabbs, data_float3_), convert_float3_and_aabb_functor<T>());
     T min_data = utility::ComputeMinBound<T::SizeAtCompileTime, typename T::Scalar>(data);
     T max_data = utility::ComputeMaxBound<T::SizeAtCompileTime, typename T::Scalar>(data);
-    extent_.min = make_float3(min_data[0], min_data[1], min_data[2]);
-    extent_.max = make_float3(max_data[0], max_data[1], max_data[2]);
+    extent_.first = Eigen::Vector3f(min_data[0], min_data[1], min_data[2]);
+    extent_.second = Eigen::Vector3f(max_data[0], max_data[1], max_data[2]);
+    auto extent_float3 = to_float3_aabb(extent_);
 
-    utility::device_vector<unsigned long long int> morton_codes(n_points_);
+    // compute the morton codes of the aabbs
+    utility::device_vector<lbvh::HashType> morton_codes(n_points_);
     thrust::transform(
         aabbs.begin(), aabbs.end(), morton_codes.begin(),
-        [this] __device__ (const lbvh::AABB& aabb) { return lbvh::morton_code(aabb, extent_); });
-    dim3 block_dim, grid_dim;
-    std::tie(block_dim, grid_dim) = utility::SelectBlockGridSizes(n_points_);
-    compute_morton_points_kernel<<<grid_dim, block_dim>>>(
-        thrust::raw_pointer_cast(data_float3_.data()), extent_, thrust::raw_pointer_cast(morton_codes.data()), n_points_);
-    cudaSafeCall(cudaDeviceSynchronize());
+        [extent_float3] __device__ (const lbvh::AABB& aabb) { return lbvh::morton_code(aabb, extent_float3); });
+
+    // sort everything by the morton codes
     sorted_indices_.resize(morton_codes.size());
     thrust::sequence(sorted_indices_.begin(), sorted_indices_.end());
     thrust::sort_by_key(morton_codes.begin(), morton_codes.end(), make_tuple_begin(sorted_indices_, aabbs));
 
-    nodes_.resize(n_nodes_);
+    // allocate space for the nodes as a raw cuda array
+    nodes_->resize(n_nodes_);
     initialize_tree_kernel<<<grid_dim, block_dim>>>(
-        thrust::raw_pointer_cast(nodes_.data()), thrust::raw_pointer_cast(aabbs.data()), n_points_);
+        thrust::raw_pointer_cast(nodes_->data()), thrust::raw_pointer_cast(aabbs.data()), n_points_);
     cudaSafeCall(cudaDeviceSynchronize());
     thrust::device_vector<unsigned int> root_node_index(1, std::numeric_limits<unsigned int>::max());
     construct_tree_kernel<<<grid_dim, block_dim>>>(
-        thrust::raw_pointer_cast(nodes_.data()),
+        thrust::raw_pointer_cast(nodes_->data()),
         thrust::raw_pointer_cast(root_node_index.data()),
         thrust::raw_pointer_cast(morton_codes.data()), n_points_);
     cudaSafeCall(cudaDeviceSynchronize());
@@ -102,10 +158,11 @@ bool LinearBoundingVolumeHierarchyKNN::SetRawData(const utility::device_vector<T
     if (leaf_size_ > 1) {
         utility::device_vector<unsigned int> valid(n_nodes_, 1);
         optimize_tree_kernel<<<grid_dim, block_dim>>>(
-            thrust::raw_pointer_cast(nodes_.data()),
+            thrust::raw_pointer_cast(nodes_->data()),
             thrust::raw_pointer_cast(root_node_index.data()),
             thrust::raw_pointer_cast(valid.data()), leaf_size_, n_points_);
         cudaSafeCall(cudaDeviceSynchronize());
+        // compact the tree to increase bandwidth
         if (compact_) {
             utility::device_vector<unsigned int> valid_sums(n_nodes_ + 1, 0);
             thrust::inclusive_scan(valid.begin(), valid.end(), valid_sums.begin() + 1);
@@ -125,14 +182,13 @@ bool LinearBoundingVolumeHierarchyKNN::SetRawData(const utility::device_vector<T
             unsigned int first_moved = valid_sums[new_node_count];
             std::tie(block_dim, grid_dim) = utility::SelectBlockGridSizes(n_nodes_);
             compact_tree_kernel<<<grid_dim, block_dim>>>(
-                thrust::raw_pointer_cast(nodes_.data()),
+                thrust::raw_pointer_cast(nodes_->data()),
                 thrust::raw_pointer_cast(root_node_index.data()),
                 thrust::raw_pointer_cast(valid_sums.data()),
                 thrust::raw_pointer_cast(free.data()),
                 first_moved, new_node_count, n_nodes_);
             if (shrink_to_fit_) {
-                utility::device_vector<lbvh::BVHNode> nodes_old(nodes_);
-                nodes_.resize(new_node_count);
+                nodes_->resize(new_node_count);
             }
             n_nodes_ = new_node_count;
         }
@@ -141,6 +197,63 @@ bool LinearBoundingVolumeHierarchyKNN::SetRawData(const utility::device_vector<T
     return true;
 }
 
+template <typename InputIterator, int Dim>
+int LinearBoundingVolumeHierarchyKNN::SearchNN(InputIterator first,
+                                               InputIterator last,
+                                               float radius,
+                                               utility::device_vector<unsigned int> &indices,
+                                               utility::device_vector<float> &distance2) const {
+    size_t num_query = thrust::distance(first, last);
+    auto extent_float3 = to_float3_aabb(extent_);
+    utility::device_vector<float3> data_float3(num_query);
+    thrust::transform(first, last, data_float3.begin(), convert_float3_functor<Dim>());
+
+    utility::device_vector<lbvh::HashType> morton_codes(num_query);
+    utility::device_vector<unsigned int> sorted_indices(num_query);
+    thrust::sequence(sorted_indices.begin(), sorted_indices.end());
+    if (sort_queries_) {
+        dim3 block_dim, grid_dim;
+        std::tie(block_dim, grid_dim) = utility::SelectBlockGridSizes(num_query);
+        compute_morton_points_kernel<<<grid_dim, block_dim>>>(
+            thrust::raw_pointer_cast(data_float3.data()), extent_float3, thrust::raw_pointer_cast(morton_codes.data()), num_query);
+        cudaSafeCall(cudaDeviceSynchronize());
+        thrust::sort_by_key(morton_codes.begin(), morton_codes.end(), sorted_indices.begin());
+    }
+
+    dim3 block_dim, grid_dim;
+    std::tie(block_dim, grid_dim) = utility::SelectBlockGridSizes(num_query);
+    indices.resize(num_query, std::numeric_limits<unsigned int>::max());
+    distance2.resize(num_query, std::numeric_limits<float>::max());
+    utility::device_vector<unsigned int> neighbors(num_query, 0);
+
+    query_knn_kernel<<<grid_dim, block_dim>>>(
+        thrust::raw_pointer_cast(nodes_->data()),
+        thrust::raw_pointer_cast(data_float3_.data()),
+        thrust::raw_pointer_cast(sorted_indices_.data()),
+        root_node_index_,
+        radius * radius,
+        thrust::raw_pointer_cast(data_float3.data()),
+        thrust::raw_pointer_cast(sorted_indices.data()),
+        num_query,
+        thrust::raw_pointer_cast(indices.data()),
+        thrust::raw_pointer_cast(distance2.data()),
+        thrust::raw_pointer_cast(neighbors.data()));
+    cudaSafeCall(cudaDeviceSynchronize());
+    return 1;
+}
+
+template int LinearBoundingVolumeHierarchyKNN::SearchNN<Eigen::Vector3f>(
+        const utility::device_vector<Eigen::Vector3f> &query,
+        float radius,
+        utility::device_vector<unsigned int> &indices,
+        utility::device_vector<float> &distance2) const;
+
+template int LinearBoundingVolumeHierarchyKNN::SearchNN<Eigen::Vector3f>(
+        const Eigen::Vector3f &query,
+        float radius,
+        thrust::host_vector<unsigned int> &indices,
+        thrust::host_vector<float> &distance2) const;
+
 template bool LinearBoundingVolumeHierarchyKNN::SetRawData<Eigen::Vector3f>(
         const utility::device_vector<Eigen::Vector3f> &data);
 
diff --git a/src/cupoch/knn/lbvh_knn.h b/src/cupoch/knn/lbvh_knn.h
index 32bed373..4f03739f 100644
--- a/src/cupoch/knn/lbvh_knn.h
+++ b/src/cupoch/knn/lbvh_knn.h
@@ -19,12 +19,14 @@
  * IN THE SOFTWARE.
  **/
 #pragma once
-#include "cupoch/knn/lbvh_knn.h"
-#include <lbvh_index/aabb.cuh>
-#include <lbvh_index/lbvh.cuh>
+#include <Eigen/Core>
 
 #include "cupoch/utility/device_vector.h"
 
+namespace lbvh {
+struct BVHNode;
+}
+
 namespace cupoch {
 
 namespace geometry {
@@ -33,27 +35,36 @@ class Geometry;
 
 namespace knn {
 
+using AABB = std::pair<Eigen::Vector3f, Eigen::Vector3f>;
+
+
 class LinearBoundingVolumeHierarchyKNN {
 public:
-    LinearBoundingVolumeHierarchyKNN(size_t leaf_size = 32, bool compact = true, bool sort_queries = true, bool shrink_to_fit = false);
-    LinearBoundingVolumeHierarchyKNN(const geometry::Geometry &geometry);
+    LinearBoundingVolumeHierarchyKNN(size_t leaf_size = 32, bool compact = false, bool sort_queries = false, bool shrink_to_fit = false);
+    LinearBoundingVolumeHierarchyKNN(const utility::device_vector<Eigen::Vector3f> &data, size_t leaf_size = 32, bool compact = false, bool sort_queries = false, bool shrink_to_fit = false);
     ~LinearBoundingVolumeHierarchyKNN();
     LinearBoundingVolumeHierarchyKNN(const LinearBoundingVolumeHierarchyKNN &) = delete;
     LinearBoundingVolumeHierarchyKNN &operator=(const LinearBoundingVolumeHierarchyKNN &) = delete;
 
 public:
     template <typename InputIterator, int Dim>
-    int SearchKNN(InputIterator first,
-                  InputIterator last,
-                  int knn,
-                  utility::device_vector<unsigned int> &indices,
-                  utility::device_vector<float> &distance2) const;
+    int SearchNN(InputIterator first,
+                 InputIterator last,
+                 float radius,
+                 utility::device_vector<unsigned int> &indices,
+                 utility::device_vector<float> &distance2) const;
+
+    template <typename T>
+    int SearchNN(const utility::device_vector<T> &query,
+                 float radius,
+                 utility::device_vector<unsigned int> &indices,
+                 utility::device_vector<float> &distance2) const;
 
     template <typename T>
-    int SearchKNN(const utility::device_vector<T> &query,
-                  int knn,
-                  utility::device_vector<unsigned int> &indices,
-                  utility::device_vector<float> &distance2) const;
+    int SearchNN(const T &query,
+                 float radius,
+                 thrust::host_vector<unsigned int> &indices,
+                 thrust::host_vector<float> &distance2) const;
 
     template <typename T>
     bool SetRawData(const utility::device_vector<T> &data);
@@ -67,8 +78,8 @@ class LinearBoundingVolumeHierarchyKNN {
     bool sort_queries_;
     bool shrink_to_fit_;
 
-    lbvh::AABB extent_;
-    utility::device_vector<lbvh::BVHNode> nodes_;
+    AABB extent_;
+    std::unique_ptr<utility::device_vector<lbvh::BVHNode>> nodes_;
     utility::device_vector<float3> data_float3_;
     utility::device_vector<unsigned int> sorted_indices_;
     unsigned int root_node_index_;
@@ -77,4 +88,3 @@ class LinearBoundingVolumeHierarchyKNN {
 }
 }
 
-#include "cupoch/knn/lbvh_knn.inl"
diff --git a/src/cupoch/knn/lbvh_knn.inl b/src/cupoch/knn/lbvh_knn.inl
deleted file mode 100644
index 76d37131..00000000
--- a/src/cupoch/knn/lbvh_knn.inl
+++ /dev/null
@@ -1,90 +0,0 @@
-/**
- * Copyright (c) 2022 Neka-Nat
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- **/
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#include "cupoch/knn/lbvh_knn.h"
-#include <lbvh_index/lbvh_kernels.cuh>
-#include <lbvh_index/query_knn_kernels.cuh>
-
-#include "cupoch/utility/platform.h"
-#include "cupoch/utility/helper.h"
-
-namespace {
-
-template<int Dim>
-struct convert_float3_functor {
-    convert_float3_functor() {}
-    __device__
-    float3 operator() (const Eigen::Matrix<float, Dim, 1>& x) {
-        return make_float3(x[0], x[1], x[2]);
-    }
-};
-
-}
-namespace cupoch {
-namespace knn {
-
-template <typename InputIterator, int Dim>
-int LinearBoundingVolumeHierarchyKNN::SearchKNN(InputIterator first,
-                                                InputIterator last,
-                                                int knn,
-                                                utility::device_vector<unsigned int> &indices,
-                                                utility::device_vector<float> &distance2) const {
-    size_t num_query = thrust::distance(first, last);
-    utility::device_vector<float3> data_float3(num_query);
-    thrust::transform(first, last, data_float3.begin(), convert_float3_functor<Dim>());
-
-    utility::device_vector<unsigned long long int> morton_codes(num_query);
-    utility::device_vector<unsigned int> sorted_indices(num_query);
-    thrust::sequence(sorted_indices.begin(), sorted_indices.end());
-    if (sort_queries_) {
-        dim3 block_dim, grid_dim;
-        std::tie(block_dim, grid_dim) = utility::SelectBlockGridSizes(num_query);
-        compute_morton_points_kernel<<<grid_dim, block_dim>>>(
-            thrust::raw_pointer_cast(data_float3.data()), extent_, thrust::raw_pointer_cast(morton_codes.data()), num_query);
-        cudaSafeCall(cudaDeviceSynchronize());
-        thrust::sort_by_key(morton_codes.begin(), morton_codes.end(), sorted_indices.begin());
-    }
-
-    dim3 block_dim, grid_dim;
-    std::tie(block_dim, grid_dim) = utility::SelectBlockGridSizes(num_query);
-    indices.resize(num_query * knn, std::numeric_limits<unsigned int>::max());
-    distance2.resize(num_query * knn, std::numeric_limits<float>::max());
-    utility::device_vector<unsigned int> neighbors(num_query, 0);
-
-    query_knn_kernel<<<grid_dim, block_dim>>>(
-        thrust::raw_pointer_cast(nodes_.data()),
-        thrust::raw_pointer_cast(data_float3_.data()),
-        thrust::raw_pointer_cast(sorted_indices_.data()),
-        root_node_index_,
-        std::numeric_limits<float>::max(),
-        thrust::raw_pointer_cast(data_float3.data()),
-        thrust::raw_pointer_cast(sorted_indices.data()),
-        num_query,
-        thrust::raw_pointer_cast(indices.data()),
-        thrust::raw_pointer_cast(distance2.data()),
-        thrust::raw_pointer_cast(neighbors.data()));
-    cudaSafeCall(cudaDeviceSynchronize());
-    return 1;
-}
-
-}
-}
\ No newline at end of file
diff --git a/src/cupoch/registration/kabsch.cu b/src/cupoch/registration/kabsch.cu
index c336bd53..ae806207 100644
--- a/src/cupoch/registration/kabsch.cu
+++ b/src/cupoch/registration/kabsch.cu
@@ -32,13 +32,21 @@
 using namespace cupoch;
 using namespace cupoch::registration;
 
+Eigen::Matrix4f_u cupoch::registration::Kabsch(const utility::device_vector<Eigen::Vector3f> &model,
+                         const utility::device_vector<Eigen::Vector3f> &target,
+                         const CorrespondenceSet &corres) {
+    return Kabsch(utility::GetStream(0), utility::GetStream(1), model, target,
+                  corres);
+}
+
 Eigen::Matrix4f_u cupoch::registration::Kabsch(
+        cudaStream_t stream1, cudaStream_t stream2,
         const utility::device_vector<Eigen::Vector3f> &model,
         const utility::device_vector<Eigen::Vector3f> &target,
         const CorrespondenceSet &corres) {
     // Compute the center
     auto res1 = thrust::async::reduce(
-            utility::exec_policy(utility::GetStream(0)),
+            utility::exec_policy(stream1),
             thrust::make_permutation_iterator(
                     model.begin(),
                     thrust::make_transform_iterator(
@@ -51,7 +59,7 @@ Eigen::Matrix4f_u cupoch::registration::Kabsch(
                             element_get_functor<Eigen::Vector2i, 0>())),
             Eigen::Vector3f(0.0, 0.0, 0.0), thrust::plus<Eigen::Vector3f>());
     auto res2 = thrust::async::reduce(
-            utility::exec_policy(utility::GetStream(1)),
+            utility::exec_policy(stream2),
             thrust::make_permutation_iterator(
                     target.begin(),
                     thrust::make_transform_iterator(
@@ -73,7 +81,7 @@ Eigen::Matrix4f_u cupoch::registration::Kabsch(
     // Compute the H matrix
     const Eigen::Matrix3f init = Eigen::Matrix3f::Zero();
     Eigen::Matrix3f hh = thrust::inner_product(
-            utility::exec_policy(0),
+            utility::exec_policy(stream1),
             thrust::make_permutation_iterator(
                     model.begin(),
                     thrust::make_transform_iterator(
@@ -111,14 +119,20 @@ Eigen::Matrix4f_u cupoch::registration::Kabsch(
     return tr;
 }
 
+Eigen::Matrix4f_u cupoch::registration::Kabsch(const utility::device_vector<Eigen::Vector3f> &model,
+                         const utility::device_vector<Eigen::Vector3f> &target) {
+    return Kabsch(utility::GetStream(0), utility::GetStream(1), model, target);
+}
+
 Eigen::Matrix4f_u cupoch::registration::Kabsch(
+        cudaStream_t stream1, cudaStream_t stream2,
         const utility::device_vector<Eigen::Vector3f> &model,
         const utility::device_vector<Eigen::Vector3f> &target) {
     CorrespondenceSet corres(model.size());
     thrust::tabulate(corres.begin(), corres.end(), [] __device__(size_t idx) {
         return Eigen::Vector2i(idx, idx);
     });
-    return Kabsch(model, target, corres);
+    return Kabsch(stream1, stream2, model, target, corres);
 }
 
 Eigen::Matrix4f_u cupoch::registration::KabschWeighted(
diff --git a/src/cupoch/registration/kabsch.h b/src/cupoch/registration/kabsch.h
index ecab45f9..4c4bfc10 100644
--- a/src/cupoch/registration/kabsch.h
+++ b/src/cupoch/registration/kabsch.h
@@ -30,9 +30,18 @@ namespace registration {
 Eigen::Matrix4f_u Kabsch(const utility::device_vector<Eigen::Vector3f> &model,
                          const utility::device_vector<Eigen::Vector3f> &target,
                          const CorrespondenceSet &corres);
+Eigen::Matrix4f_u Kabsch(cudaStream_t stream1,
+                         cudaStream_t stream2,
+                         const utility::device_vector<Eigen::Vector3f> &model,
+                         const utility::device_vector<Eigen::Vector3f> &target,
+                         const CorrespondenceSet &corres);
 
 Eigen::Matrix4f_u Kabsch(const utility::device_vector<Eigen::Vector3f> &model,
                          const utility::device_vector<Eigen::Vector3f> &target);
+Eigen::Matrix4f_u Kabsch(cudaStream_t stream1,
+                         cudaStream_t stream2,
+                         const utility::device_vector<Eigen::Vector3f> &model,
+                         const utility::device_vector<Eigen::Vector3f> &target);
 
 Eigen::Matrix4f_u KabschWeighted(
         const utility::device_vector<Eigen::Vector3f> &model,
diff --git a/src/cupoch/registration/transformation_estimation.cu b/src/cupoch/registration/transformation_estimation.cu
index daad683e..e8602d2b 100644
--- a/src/cupoch/registration/transformation_estimation.cu
+++ b/src/cupoch/registration/transformation_estimation.cu
@@ -55,6 +55,55 @@ struct pt2pl_jacobian_residual_functor
     }
 };
 
+struct symmetric_jacobian_residual_functor
+    : public utility::jacobian_residual_functor<Eigen::Vector6f> {
+    symmetric_jacobian_residual_functor(const Eigen::Vector3f *source,
+                                    const Eigen::Vector3f *source_normals,
+                                    const Eigen::Vector3f *target_points,
+                                    const Eigen::Vector3f *target_normals,
+                                    const Eigen::Vector2i *corres)
+        : source_(source),
+          source_normals_(source_normals),
+          target_points_(target_points),
+          target_normals_(target_normals),
+          corres_(corres){};
+    const Eigen::Vector3f *source_;
+    const Eigen::Vector3f *source_normals_;
+    const Eigen::Vector3f *target_points_;
+    const Eigen::Vector3f *target_normals_;
+    const Eigen::Vector2i *corres_;
+    __device__ void operator()(int idx, Eigen::Vector6f &vec, float &r) const {
+        const Eigen::Vector3f &vs = source_[corres_[idx][0]];
+        const Eigen::Vector3f &ns = source_normals_[corres_[idx][0]];
+        const Eigen::Vector3f &vt = target_points_[corres_[idx][1]];
+        const Eigen::Vector3f &nt = target_normals_[corres_[idx][1]];
+
+        Eigen::Vector3f n = ns + nt;  // sum of normals
+        Eigen::Vector3f d = vs - vt;  // difference of points
+
+        r = d.dot(n);  // symmetric residual
+
+        Eigen::Vector3f cross_product = (vs + vt).cross(n);
+        vec.block<3, 1>(0, 0) = cross_product;
+        vec.block<3, 1>(3, 0) = n;
+    }
+};
+
+// Define a function or a lambda to compute the error using normals
+__device__ float ComputeErrorUsingNormals(
+        const Eigen::Vector3f &source_point,
+        const Eigen::Vector3f &source_normal,
+        const Eigen::Vector3f &target_point,
+        const Eigen::Vector3f &target_normal) {
+    // Symmetric treatment of normals
+    Eigen::Vector3f combined_normal = source_normal + target_normal;
+
+    // Compute the symmetric point-to-plane error
+    float error = (source_point - target_point).dot(combined_normal);
+
+    return error * error;  // Return squared error for consistency
+}
+
 }  // namespace
 
 float TransformationEstimationPointToPoint::ComputeRMSE(
@@ -114,7 +163,8 @@ float TransformationEstimationPointToPlane::ComputeRMSE(
                             target.normals_.begin(),
                             thrust::make_transform_iterator(
                                     corres.begin(),
-                                    element_get_functor<Eigen::Vector2i, 1>()))),
+                                    element_get_functor<Eigen::Vector2i,
+                                                        1>()))),
             make_tuple_iterator(
                     thrust::make_permutation_iterator(
                             source.points_.begin(),
@@ -130,7 +180,8 @@ float TransformationEstimationPointToPlane::ComputeRMSE(
                             target.normals_.begin(),
                             thrust::make_transform_iterator(
                                     corres.end(),
-                                    element_get_functor<Eigen::Vector2i, 1>()))),
+                                    element_get_functor<Eigen::Vector2i,
+                                                        1>()))),
             [] __device__(const thrust::tuple<Eigen::Vector3f, Eigen::Vector3f,
                                               Eigen::Vector3f> &x) {
                 float r = (thrust::get<0>(x) - thrust::get<1>(x))
@@ -168,4 +219,132 @@ Eigen::Matrix4f TransformationEstimationPointToPlane::ComputeTransformation(
                                                                  det_thresh_);
 
     return is_success ? extrinsic : Eigen::Matrix4f::Identity();
+}
+
+float TransformationEstimationSymmetricMethod::ComputeRMSE(
+        const geometry::PointCloud &source,
+        const geometry::PointCloud &target,
+        const CorrespondenceSet &corres) const {
+    if (corres.empty() || !source.HasNormals() || !target.HasNormals())
+        return 0.0;
+    const float err = thrust::transform_reduce(
+            utility::exec_policy(0),
+            make_tuple_iterator(
+                    thrust::make_permutation_iterator(
+                            source.points_.begin(),
+                            thrust::make_transform_iterator(
+                                    corres.begin(),
+                                    element_get_functor<Eigen::Vector2i, 0>())),
+                    thrust::make_permutation_iterator(
+                            source.normals_.begin(),
+                            thrust::make_transform_iterator(
+                                    corres.begin(),
+                                    element_get_functor<Eigen::Vector2i, 0>())),
+                    thrust::make_permutation_iterator(
+                            target.points_.begin(),
+                            thrust::make_transform_iterator(
+                                    corres.begin(),
+                                    element_get_functor<Eigen::Vector2i, 1>())),
+                    thrust::make_permutation_iterator(
+                            target.normals_.begin(),
+                            thrust::make_transform_iterator(
+                                    corres.begin(),
+                                    element_get_functor<Eigen::Vector2i,
+                                                        1>()))),
+            make_tuple_iterator(
+                    thrust::make_permutation_iterator(
+                            source.points_.begin(),
+                            thrust::make_transform_iterator(
+                                    corres.end(),
+                                    element_get_functor<Eigen::Vector2i, 0>())),
+                    thrust::make_permutation_iterator(
+                            source.normals_.begin(),
+                            thrust::make_transform_iterator(
+                                    corres.end(),
+                                    element_get_functor<Eigen::Vector2i, 0>())),
+                    thrust::make_permutation_iterator(
+                            target.points_.begin(),
+                            thrust::make_transform_iterator(
+                                    corres.end(),
+                                    element_get_functor<Eigen::Vector2i, 1>())),
+                    thrust::make_permutation_iterator(
+                            target.normals_.begin(),
+                            thrust::make_transform_iterator(
+                                    corres.end(),
+                                    element_get_functor<Eigen::Vector2i,
+                                                        1>()))),
+            [] __device__(
+                    const thrust::tuple<Eigen::Vector3f, Eigen::Vector3f,
+                                        Eigen::Vector3f, Eigen::Vector3f> &x) {
+                // Compute error using both source and target normals
+                float r = ComputeErrorUsingNormals(
+                        thrust::get<0>(x), thrust::get<1>(x), thrust::get<2>(x),
+                        thrust::get<3>(x));
+                return r * r;
+            },
+            0.0f, thrust::plus<float>());
+    return std::sqrt(err / (float)corres.size());
+}
+
+Eigen::Matrix4f TransformationEstimationSymmetricMethod::ComputeTransformation(
+        const geometry::PointCloud &source,
+        const geometry::PointCloud &target,
+        const CorrespondenceSet &corres) const {
+    if (corres.empty() || !source.HasNormals() || !target.HasNormals())
+        return Eigen::Matrix4f::Identity();
+
+    Eigen::Matrix6f JTJ;
+    Eigen::Vector6f JTr;
+    float r2;
+    symmetric_jacobian_residual_functor func(
+            thrust::raw_pointer_cast(source.points_.data()),
+            thrust::raw_pointer_cast(source.normals_.data()),
+            thrust::raw_pointer_cast(target.points_.data()),
+            thrust::raw_pointer_cast(target.normals_.data()),
+            thrust::raw_pointer_cast(corres.data()));
+    thrust::tie(JTJ, JTr, r2) =
+            utility::ComputeJTJandJTr<Eigen::Matrix6f, Eigen::Vector6f,
+                                      symmetric_jacobian_residual_functor>(
+                    func, (int)corres.size());
+
+    bool is_success;
+    Eigen::Matrix4f transformation_matrix;
+    thrust::tie(is_success, transformation_matrix) =
+            utility::SolveJacobianSystemAndObtainExtrinsicMatrix(JTJ, JTr,
+                                                                 det_thresh_);
+    if (is_success) {
+        // Extract the rotation matrix (3x3 upper-left block) from the 4x4
+        // transformation matrix. This matrix represents the rotation component
+        // of the transformation.
+        Eigen::Matrix3f rotation_matrix_3f =
+                transformation_matrix.template block<3, 3>(0, 0);
+
+        // Convert the rotation matrix to double precision for higher accuracy.
+        // This step is essential to maintain accuracy in the subsequent
+        // squaring operation.
+        Eigen::Matrix3d rotation_matrix_3d = rotation_matrix_3f.cast<double>();
+
+        // Square the rotation matrix as described in the original Symmetric ICP
+        // paper. This approach, part of the rotated-normals version of the
+        // symmetric objective, optimizes for half of the rotation angle,
+        // reducing linearization error and enabling exact correspondences in
+        // certain cases.
+        Eigen::Matrix3d R = rotation_matrix_3d * rotation_matrix_3d;
+
+        // Construct the final transformation matrix using the squared rotation
+        // matrix (R) and the translation vector. The translation component is
+        // extracted directly from the original 4x4 transformation matrix.
+        Eigen::Matrix4f extrinsic = Eigen::Matrix4f::Identity();
+        extrinsic.template block<3, 3>(0, 0) =
+                R.cast<float>();  // Set the rotation part with the squared
+                                  // rotation matrix.
+        extrinsic.template block<3, 1>(0, 3) =
+                transformation_matrix.template block<3, 1>(
+                        0, 3);  // Set the translation part from the original
+                                // transformation matrix.
+
+        return extrinsic;
+    }
+
+    return Eigen::Matrix4f::Identity();
 }
\ No newline at end of file
diff --git a/src/cupoch/registration/transformation_estimation.h b/src/cupoch/registration/transformation_estimation.h
index a0aa262b..fe344abd 100644
--- a/src/cupoch/registration/transformation_estimation.h
+++ b/src/cupoch/registration/transformation_estimation.h
@@ -39,7 +39,8 @@ enum class TransformationEstimationType {
     Unspecified = 0,
     PointToPoint = 1,
     PointToPlane = 2,
-    ColoredICP = 3,
+    SymmetricMethod = 3,
+    ColoredICP = 4,
 };
 
 /// Base class that estimates a transformation between two point clouds
@@ -113,5 +114,32 @@ class TransformationEstimationPointToPlane : public TransformationEstimation {
             TransformationEstimationType::PointToPlane;
 };
 
+/// Estimate a transformation for plane to plane distance
+class TransformationEstimationSymmetricMethod : public TransformationEstimation {
+public:
+    TransformationEstimationSymmetricMethod(float det_thresh = 1.0e-6)
+        : det_thresh_(det_thresh) {}
+    ~TransformationEstimationSymmetricMethod() override {}
+
+public:
+    TransformationEstimationType GetTransformationEstimationType()
+            const override {
+        return type_;
+    };
+    float ComputeRMSE(const geometry::PointCloud &source,
+                      const geometry::PointCloud &target,
+                      const CorrespondenceSet &corres) const override;
+    Eigen::Matrix4f ComputeTransformation(
+            const geometry::PointCloud &source,
+            const geometry::PointCloud &target,
+            const CorrespondenceSet &corres) const override;
+
+    float det_thresh_;
+
+private:
+    const TransformationEstimationType type_ =
+            TransformationEstimationType::SymmetricMethod;
+};
+
 }  // namespace registration
 }  // namespace cupoch
\ No newline at end of file
diff --git a/src/cupoch/utility/device_vector.h b/src/cupoch/utility/device_vector.h
index cdcaf825..1d3e117d 100644
--- a/src/cupoch/utility/device_vector.h
+++ b/src/cupoch/utility/device_vector.h
@@ -34,7 +34,8 @@
 #include <thrust/device_vector.h>
 #endif
 #include <thrust/host_vector.h>
-#include <thrust/system/cuda/experimental/pinned_allocator.h>
+#include <cupoch/utility/pinned_allocator.h>
+
 
 #if defined(_WIN32)
 struct float4_t {
diff --git a/src/cupoch/utility/eigen.h b/src/cupoch/utility/eigen.h
index 9bdd2e58..2e7f2766 100644
--- a/src/cupoch/utility/eigen.h
+++ b/src/cupoch/utility/eigen.h
@@ -86,6 +86,11 @@ template <typename MatType, typename VecType, typename FuncType>
 thrust::tuple<MatType, VecType, float> ComputeJTJandJTr(const FuncType &f,
                                                         int iteration_num,
                                                         bool verbose = true);
+template <typename MatType, typename VecType, typename FuncType>
+thrust::tuple<MatType, VecType, float> ComputeJTJandJTr(cudaStream_t stream,
+                                                        const FuncType &f,
+                                                        int iteration_num,
+                                                        bool verbose = true);
 
 /// Function to compute JTJ and Jtr
 /// Input: function pointer f and total number of rows of Jacobian matrix
@@ -96,6 +101,11 @@ template <typename MatType, typename VecType, int NumJ, typename FuncType>
 thrust::tuple<MatType, VecType, float> ComputeJTJandJTr(const FuncType &f,
                                                         int iteration_num,
                                                         bool verbose = true);
+template <typename MatType, typename VecType, int NumJ, typename FuncType>
+thrust::tuple<MatType, VecType, float> ComputeJTJandJTr(cudaStream_t stream,
+                                                        FuncType &f,
+                                                        int iteration_num,
+                                                        bool verbose = true);
 
 template <typename MatType,
           typename VecType,
diff --git a/src/cupoch/utility/eigen.inl b/src/cupoch/utility/eigen.inl
index dc650034..ea7016a9 100644
--- a/src/cupoch/utility/eigen.inl
+++ b/src/cupoch/utility/eigen.inl
@@ -80,10 +80,20 @@ struct wrapped_calc_weights_functor {
 
 }  // namespace
 
+
 template <typename MatType, typename VecType, typename FuncType>
 thrust::tuple<MatType, VecType, float> ComputeJTJandJTr(const FuncType &f,
                                                         int iteration_num,
                                                         bool verbose) {
+    return ComputeJTJandJTr<MatType, VecType, FuncType>(
+            0, f, iteration_num, verbose);
+}
+
+template <typename MatType, typename VecType, typename FuncType>
+thrust::tuple<MatType, VecType, float> ComputeJTJandJTr(cudaStream_t stream,
+                                                        const FuncType &f,
+                                                        int iteration_num,
+                                                        bool verbose) {
     MatType JTJ;
     VecType JTr;
     float r2_sum = 0.0;
@@ -91,6 +101,7 @@ thrust::tuple<MatType, VecType, float> ComputeJTJandJTr(const FuncType &f,
     JTr.setZero();
     jtj_jtr_functor<MatType, VecType, FuncType> func(f);
     auto jtj_jtr_r2 = thrust::transform_reduce(
+            utility::exec_policy(stream),
             thrust::make_counting_iterator(0),
             thrust::make_counting_iterator(iteration_num), func,
             thrust::make_tuple(JTJ, JTr, r2_sum),
@@ -106,6 +117,13 @@ thrust::tuple<MatType, VecType, float> ComputeJTJandJTr(const FuncType &f,
 template <typename MatType, typename VecType, int NumJ, typename FuncType>
 thrust::tuple<MatType, VecType, float> ComputeJTJandJTr(
         const FuncType &f, int iteration_num, bool verbose /*=true*/) {
+    return ComputeJTJandJTr<MatType, VecType, NumJ, FuncType>(
+            0, f, iteration_num, verbose);
+}
+
+template <typename MatType, typename VecType, int NumJ, typename FuncType>
+thrust::tuple<MatType, VecType, float> ComputeJTJandJTr(
+        cudaStream_t stream, const FuncType &f, int iteration_num, bool verbose /*=true*/) {
     MatType JTJ;
     VecType JTr;
     float r2_sum = 0.0;
@@ -113,6 +131,7 @@ thrust::tuple<MatType, VecType, float> ComputeJTJandJTr(
     JTr.setZero();
     multiple_jtj_jtr_functor<MatType, VecType, NumJ, FuncType> func(f);
     auto jtj_jtr_r2 = thrust::transform_reduce(
+            utility::exec_policy(stream),
             thrust::make_counting_iterator(0),
             thrust::make_counting_iterator(iteration_num), func,
             thrust::make_tuple(JTJ, JTr, r2_sum),
diff --git a/src/cupoch/utility/pinned_allocator.h b/src/cupoch/utility/pinned_allocator.h
new file mode 100644
index 00000000..c126e048
--- /dev/null
+++ b/src/cupoch/utility/pinned_allocator.h
@@ -0,0 +1,252 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/system/cuda/experimental/pinned_allocator.h
+ *  \brief An allocator which creates new elements in "pinned" memory with \p cudaMallocHost
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
+#include <stdexcept>
+#include <limits>
+#include <string>
+#include <thrust/system/system_error.h>
+#include <thrust/system/cuda/error.h>
+
+
+#ifndef THRUST_NAMESPACE_BEGIN
+#define THRUST_NAMESPACE_BEGIN namespace thrust {
+#endif // THRUST_NAMESPACE_BEGIN
+
+#ifndef THRUST_NAMESPACE_END
+#define THRUST_NAMESPACE_END }
+#endif // THRUST_NAMESPACE_END
+
+
+THRUST_NAMESPACE_BEGIN
+
+namespace system
+{
+
+namespace cuda
+{
+
+namespace experimental
+{
+
+/*! \addtogroup memory_management_classes
+ *  \ingroup memory_management
+ *  \{
+ */
+
+/*! \p pinned_allocator is a CUDA-specific host memory allocator
+ *  that employs \c cudaMallocHost for allocation.
+ *
+ *  \see https://en.cppreference.com/w/cpp/memory/allocator
+ */
+template<typename T> class pinned_allocator;
+
+template<>
+  class pinned_allocator<void>
+{
+  public:
+    typedef void           value_type;
+    typedef void       *   pointer;
+    typedef const void *   const_pointer;
+    typedef std::size_t    size_type;
+    typedef std::ptrdiff_t difference_type;
+
+    // convert a pinned_allocator<void> to pinned_allocator<U>
+    template<typename U>
+      struct rebind
+    {
+      typedef pinned_allocator<U> other;
+    }; // end rebind
+}; // end pinned_allocator
+
+
+template<typename T>
+  class pinned_allocator
+{
+  public:
+    //! \{
+    typedef T              value_type;
+    typedef T*             pointer;
+    typedef const T*       const_pointer;
+    typedef T&             reference;
+    typedef const T&       const_reference;
+    typedef std::size_t    size_type;
+    typedef std::ptrdiff_t difference_type;
+    //! \}
+
+    // convert a pinned_allocator<T> to pinned_allocator<U>
+    template<typename U>
+      struct rebind
+    {
+      typedef pinned_allocator<U> other;
+    }; // end rebind
+
+    /*! \p pinned_allocator's null constructor does nothing.
+     */
+    __host__ __device__
+    inline pinned_allocator() {}
+
+    /*! \p pinned_allocator's null destructor does nothing.
+     */
+    __host__ __device__
+    inline ~pinned_allocator() {}
+
+    /*! \p pinned_allocator's copy constructor does nothing.
+     */
+    __host__ __device__
+    inline pinned_allocator(pinned_allocator const &) {}
+
+    /*! This version of \p pinned_allocator's copy constructor
+     *  is templated on the \c value_type of the \p pinned_allocator
+     *  to copy from.  It is provided merely for convenience; it
+     *  does nothing.
+     */
+    template<typename U>
+    __host__ __device__
+    inline pinned_allocator(pinned_allocator<U> const &) {}
+
+    /*! This method returns the address of a \c reference of
+     *  interest.
+     *
+     *  \p r The \c reference of interest.
+     *  \return \c r's address.
+     */
+    __host__ __device__
+    inline pointer address(reference r) { return &r; }
+
+    /*! This method returns the address of a \c const_reference
+     *  of interest.
+     *
+     *  \p r The \c const_reference of interest.
+     *  \return \c r's address.
+     */
+    __host__ __device__
+    inline const_pointer address(const_reference r) { return &r; }
+
+    /*! This method allocates storage for objects in pinned host
+     *  memory.
+     *
+     *  \p cnt The number of objects to allocate.
+     *  \return a \c pointer to the newly allocated objects.
+     *  \note This method does not invoke \p value_type's constructor.
+     *        It is the responsibility of the caller to initialize the
+     *        objects at the returned \c pointer.
+     */
+    __host__
+    inline pointer allocate(size_type cnt,
+                            const_pointer = 0)
+    {
+      if(cnt > this->max_size())
+      {
+        throw std::bad_alloc();
+      } // end if
+
+      pointer result(0);
+      cudaError_t error = cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type));
+
+      if(error)
+      {
+        cudaGetLastError(); // Clear global CUDA error state.
+        throw std::bad_alloc();
+      } // end if
+
+      return result;
+    } // end allocate()
+
+    /*! This method deallocates pinned host memory previously allocated
+     *  with this \c pinned_allocator.
+     *
+     *  \p p A \c pointer to the previously allocated memory.
+     *  \p cnt The number of objects previously allocated at
+     *         \p p.
+     *  \note This method does not invoke \p value_type's destructor.
+     *        It is the responsibility of the caller to destroy
+     *        the objects stored at \p p.
+     */
+    __host__
+    inline void deallocate(pointer p, size_type /*cnt*/)
+    {
+      cudaError_t error = cudaFreeHost(p);
+
+      cudaGetLastError(); // Clear global CUDA error state.
+
+      if(error)
+      {
+        cudaGetLastError(); // Clear global CUDA error state.
+        throw thrust::system_error(error, thrust::cuda_category());
+      } // end if
+    } // end deallocate()
+
+    /*! This method returns the maximum size of the \c cnt parameter
+     *  accepted by the \p allocate() method.
+     *
+     *  \return The maximum number of objects that may be allocated
+     *          by a single call to \p allocate().
+     */
+    inline size_type max_size() const
+    {
+      return (std::numeric_limits<size_type>::max)() / sizeof(T);
+    } // end max_size()
+
+    /*! This method tests this \p pinned_allocator for equality to
+     *  another.
+     *
+     *  \param x The other \p pinned_allocator of interest.
+     *  \return This method always returns \c true.
+     */
+    __host__ __device__
+    inline bool operator==(pinned_allocator const& x) const { return true; }
+
+    /*! This method tests this \p pinned_allocator for inequality
+     *  to another.
+     *
+     *  \param x The other \p pinned_allocator of interest.
+     *  \return This method always returns \c false.
+     */
+    __host__ __device__
+    inline bool operator!=(pinned_allocator const &x) const { return !operator==(x); }
+}; // end pinned_allocator
+
+/*! \}
+ */
+
+} // end experimental
+
+} // end cuda
+
+} // end system
+
+// alias cuda's members at top-level
+namespace cuda
+{
+
+namespace experimental
+{
+
+using thrust::system::cuda::experimental::pinned_allocator;
+
+} // end experimental
+
+} // end cuda
+
+THRUST_NAMESPACE_END
diff --git a/src/cupoch/utility/platform.cu b/src/cupoch/utility/platform.cu
index 73736cfd..5a56fed1 100644
--- a/src/cupoch/utility/platform.cu
+++ b/src/cupoch/utility/platform.cu
@@ -70,7 +70,7 @@ std::tuple<dim3, dim3> SelectBlockGridSizes(int data_size, int threads_per_block
     cudaDeviceProp prop;
     cupoch::utility::GetDeviceProp(prop);
     int max_threads_per_block = prop.maxThreadsPerBlock;
-    if (threads_per_block < 0) {
+    if (threads_per_block > 0) {
         if (threads_per_block > max_threads_per_block) {
             throw std::runtime_error("Threads per block exceeds device maximum.");
         } else {
diff --git a/src/cupoch/version.txt b/src/cupoch/version.txt
index 0637c5c7..d570c031 100644
--- a/src/cupoch/version.txt
+++ b/src/cupoch/version.txt
@@ -1,4 +1,4 @@
 CUPOCH_VERSION_MAJOR 0
 CUPOCH_VERSION_MINOR 2
-CUPOCH_VERSION_PATCH 8
-CUPOCH_VERSION_TWEAK 1
+CUPOCH_VERSION_PATCH 10
+CUPOCH_VERSION_TWEAK 0
diff --git a/src/python/cupoch_pybind/registration/registration.cpp b/src/python/cupoch_pybind/registration/registration.cpp
index 2990b2b2..f00ac38d 100644
--- a/src/python/cupoch_pybind/registration/registration.cpp
+++ b/src/python/cupoch_pybind/registration/registration.cpp
@@ -119,9 +119,10 @@ void pybind_registration_classes(py::module &m) {
 
     py::enum_<registration::TransformationEstimationType> tf_type(m, "TransformationEstimationType");
     tf_type.value("PointToPoint", registration::TransformationEstimationType::PointToPoint)
-           .value("PointToPlane", registration::TransformationEstimationType::PointToPlane)
-           .value("ColoredICP", registration::TransformationEstimationType::ColoredICP)
-           .export_values();
+            .value("PointToPlane", registration::TransformationEstimationType::PointToPlane)
+            .value("SymmetricMethod", registration::TransformationEstimationType::SymmetricMethod)
+            .value("ColoredICP", registration::TransformationEstimationType::ColoredICP)
+            .export_values();
 
     // cupoch.registration.TransformationEstimationPointToPoint:
     // TransformationEstimation
@@ -164,6 +165,25 @@ void pybind_registration_classes(py::module &m) {
                 return std::string("TransformationEstimationPointToPlane");
             });
 
+    // cupoch.registration.TransformationEstimationSymmetricMethod:
+    // TransformationEstimation
+    py::class_<registration::TransformationEstimationSymmetricMethod,
+               PyTransformationEstimation<
+                       registration::TransformationEstimationSymmetricMethod>,
+               registration::TransformationEstimation>
+            te_l2l(m, "TransformationEstimationSymmetricMethod",
+                   "Class to estimate a transformation for plane to plane "
+                   "distance.");
+    py::detail::bind_default_constructor<
+            registration::TransformationEstimationSymmetricMethod>(te_l2l);
+    py::detail::bind_copy_functions<
+            registration::TransformationEstimationSymmetricMethod>(te_l2l);
+    te_l2l.def(
+            "__repr__",
+            [](const registration::TransformationEstimationSymmetricMethod &te) {
+                return std::string("TransformationEstimationSymmetricMethod");
+            });
+
     // cupoch.registration.FastGlobalRegistrationOption:
     py::class_<registration::FastGlobalRegistrationOption> fgr_option(
             m, "FastGlobalRegistrationOption",
@@ -351,8 +371,9 @@ static const std::unordered_map<std::string, std::string>
                 {"criteria", "Convergence criteria"},
                 {"estimation_method",
                  "Estimation method. One of "
-                 "(``registration::TransformationEstimationPointToPoint``, "
-                 "``registration::TransformationEstimationPointToPlane``)"},
+                 "(``registration::TransformationEstimationPointToPoint``,"
+                 "``registration::TransformationEstimationPointToPlane``,"
+                 "``registration::TransformationEstimationSymmetricMethod``)"},
                 {"init", "Initial transformation estimation"},
                 {"lambda_geometric", "lambda_geometric value"},
                 {"max_correspondence_distance",
diff --git a/src/python/pyproject.toml b/src/python/pyproject.toml
index 4198b363..09a21b43 100644
--- a/src/python/pyproject.toml
+++ b/src/python/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "cupoch"
-version = "0.2.8.1"
+version = "0.2.10.0"
 description = ""
 authors = ["nekanat <nekanat.stock@gmail.com>"]
 license = "MIT"
diff --git a/src/python/setup.py b/src/python/setup.py
index b59f1f9b..d200bfae 100644
--- a/src/python/setup.py
+++ b/src/python/setup.py
@@ -54,6 +54,7 @@ def find_stubs(package):
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
         "Topic :: Education",
         "Topic :: Multimedia :: Graphics :: 3D Rendering",
         "Topic :: Multimedia :: Graphics :: Viewers",
diff --git a/src/tests/geometry/pointcloud.cpp b/src/tests/geometry/pointcloud.cpp
index eff0ca3a..d66a25c9 100644
--- a/src/tests/geometry/pointcloud.cpp
+++ b/src/tests/geometry/pointcloud.cpp
@@ -178,18 +178,25 @@ TEST(PointCloud, GetOrientedBoundingBox) {
     geometry::OrientedBoundingBox obb;
 
     // Valid 4 points
-    pcd = geometry::PointCloud({{0, 0, 0}, {0, 0, 1}, {0, 1, 0}, {1, 1, 1}});
+    thrust::host_vector<Vector3f> points;
+    points.push_back({0, 0, 0});
+    points.push_back({0, 0, 1});
+    points.push_back({0, 1, 0});
+    points.push_back({1, 1, 1});
+    pcd = geometry::PointCloud(points);
     pcd.GetOrientedBoundingBox();
 
     // 8 points with known ground truth
-    pcd = geometry::PointCloud({{0, 0, 0},
-                                {0, 0, 1},
-                                {0, 2, 0},
-                                {0, 2, 1},
-                                {3, 0, 0},
-                                {3, 0, 1},
-                                {3, 2, 0},
-                                {3, 2, 1}});
+    points.clear();
+    points.push_back({0, 0, 0});
+    points.push_back({0, 0, 1});
+    points.push_back({0, 2, 0});
+    points.push_back({0, 2, 1});
+    points.push_back({3, 0, 0});
+    points.push_back({3, 0, 1});
+    points.push_back({3, 2, 0});
+    points.push_back({3, 2, 1});
+    pcd = geometry::PointCloud(points);
     obb = pcd.GetOrientedBoundingBox();
     EXPECT_EQ(obb.center_, Eigen::Vector3f(1.5, 1, 0.5));
     EXPECT_EQ(obb.extent_, Eigen::Vector3f(3, 2, 1));
diff --git a/src/tests/knn/lbvh_knn.cpp b/src/tests/knn/lbvh_knn.cpp
new file mode 100644
index 00000000..d171ae48
--- /dev/null
+++ b/src/tests/knn/lbvh_knn.cpp
@@ -0,0 +1,86 @@
+/**
+ * Copyright (c) 2020 Neka-Nat
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+**/
+#include "cupoch/knn/lbvh_knn.h"
+
+#include <thrust/remove.h>
+#include <thrust/sort.h>
+
+#include "cupoch/geometry/pointcloud.h"
+#include "cupoch/geometry/geometry_utils.h"
+#include "tests/test_utility/unit_test.h"
+
+using namespace Eigen;
+using namespace cupoch;
+using namespace std;
+using namespace unit_test;
+
+namespace {
+
+struct is_minus_one_functor {
+    bool operator()(int x) const { return (x == -1); }
+};
+
+struct is_inf_functor {
+    bool operator()(float x) const { return std::isinf(x); }
+};
+
+}  // namespace
+
+TEST(LinearBoundingVolumeHierarchyKNN, SearchKNN) {
+    thrust::host_vector<int> ref_indices;
+    int indices0[] = {27, 48, 4,  77, 90, 7,  54, 17, 76, 38,
+                      39, 60, 15, 84, 11, 57, 3,  32, 99, 36,
+                      52, 40, 26, 59, 22, 97, 20, 42, 73, 24};
+    for (int i = 0; i < 30; ++i) ref_indices.push_back(indices0[i]);
+
+    thrust::host_vector<float> ref_distance2;
+    float distances0[] = {
+            0.000000,  4.684353,  4.996539,  9.191849,  10.034604, 10.466745,
+            10.649751, 11.434066, 12.089195, 13.345638, 13.696270, 14.016148,
+            16.851978, 17.073435, 18.254518, 20.019994, 21.496347, 23.077277,
+            23.692427, 23.809303, 24.104578, 25.005770, 26.952710, 27.487888,
+            27.998463, 28.262975, 28.581313, 28.816608, 31.603230, 31.610916};
+    for (int i = 0; i < 30; ++i) ref_distance2.push_back(distances0[i]);
+
+    int size = 100;
+
+    geometry::PointCloud pc;
+
+    Vector3f vmin(0.0, 0.0, 0.0);
+    Vector3f vmax(10.0, 10.0, 10.0);
+
+    thrust::host_vector<Eigen::Vector3f> points(size);
+    Rand(points, vmin, vmax, 0);
+    pc.SetPoints(points);
+
+    knn::LinearBoundingVolumeHierarchyKNN kdtree(geometry::ConvertVector3fVectorRef(pc));
+
+    Eigen::Vector3f query = {1.647059, 4.392157, 8.784314};
+    thrust::host_vector<unsigned int> indices;
+    thrust::host_vector<float> distance2;
+
+    int result = kdtree.SearchNN(query, std::numeric_limits<float>::max(), indices, distance2);
+
+    EXPECT_EQ(result, 1);
+
+    EXPECT_EQ(ref_indices[0], indices[0]);
+    EXPECT_NEAR(ref_distance2[0], distance2[0], 1.0e-9);
+}
diff --git a/third_party/lbvh_index/lbvh_index/query.cuh b/third_party/lbvh_index/lbvh_index/query.cuh
index 76e44613..5fa2e50c 100644
--- a/third_party/lbvh_index/lbvh_index/query.cuh
+++ b/third_party/lbvh_index/lbvh_index/query.cuh
@@ -4,9 +4,8 @@
 #include "vec_math.h"
 
 // default is one nearest neighbor
-#ifndef K
 #define K 1
-#endif
+
 
 namespace lbvh {
 
diff --git a/third_party/lbvh_index/lbvh_index/query_knn.cuh b/third_party/lbvh_index/lbvh_index/query_knn.cuh
index 2626a8e9..e5ee095b 100644
--- a/third_party/lbvh_index/lbvh_index/query_knn.cuh
+++ b/third_party/lbvh_index/lbvh_index/query_knn.cuh
@@ -3,9 +3,6 @@
 #include "static_priorityqueue.cuh"
 
 // default is one nearest neighbor
-#ifndef K
-#define K 1
-#endif
 
 namespace lbvh {
     __device__ void query_knn(const BVHNode* __restrict__ nodes,