From 2dfd5cce084a5be76373871ac57ba4401831aea9 Mon Sep 17 00:00:00 2001
From: nekanat <nekanat.stock@gmail.com>
Date: Tue, 19 Nov 2019 23:11:59 +0900
Subject: [PATCH] kdtree cuda

---
 CMakeLists.txt                           |   8 +-
 LICENSE                                  |  21 +
 src/cupoc/geometry/CMakeLists.txt        |   8 +-
 src/cupoc/geometry/down_sample.cu        | 136 ++++
 src/cupoc/geometry/estimate_normals.cu   | 273 +++++++-
 src/cupoc/geometry/kdtree_flann.cu       | 182 +++---
 src/cupoc/geometry/kdtree_flann.h        |  26 +-
 src/cupoc/geometry/kdtree_search_param.h |   4 +
 src/cupoc/geometry/pointcloud.cu         | 125 ----
 src/cupoc/geometry/pointcloud.h          |   1 +
 src/cupoc/io/class_io/pointcloud_io.cpp  | 104 ++++
 src/cupoc/io/class_io/pointcloud_io.h    |  50 ++
 src/cupoc/io/file_format/file_pcd.cpp    | 755 +++++++++++++++++++++++
 src/cupoc/utility/filesystem.cpp         |  25 +
 src/cupoc/utility/filesystem.h           |  14 +
 src/cupoc/utility/helper.h               |  52 ++
 third_party/CMakeLists.txt               |  43 +-
 third_party/liblzf/CMakeLists.txt        |   6 +
 third_party/liblzf/lzf.h                 | 108 ++++
 third_party/liblzf/lzfP.h                | 190 ++++++
 third_party/liblzf/lzf_c.c               | 299 +++++++++
 third_party/liblzf/lzf_d.c               | 193 ++++++
 22 files changed, 2403 insertions(+), 220 deletions(-)
 create mode 100644 LICENSE
 create mode 100644 src/cupoc/geometry/down_sample.cu
 create mode 100644 src/cupoc/io/class_io/pointcloud_io.cpp
 create mode 100644 src/cupoc/io/class_io/pointcloud_io.h
 create mode 100644 src/cupoc/io/file_format/file_pcd.cpp
 create mode 100644 src/cupoc/utility/filesystem.cpp
 create mode 100644 src/cupoc/utility/filesystem.h
 create mode 100644 third_party/liblzf/CMakeLists.txt
 create mode 100644 third_party/liblzf/lzf.h
 create mode 100644 third_party/liblzf/lzfP.h
 create mode 100644 third_party/liblzf/lzf_c.c
 create mode 100644 third_party/liblzf/lzf_d.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4342782f..7eef3ce8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -106,7 +106,13 @@ set(CUDA_ARCH -gencode arch=compute_30,code=sm_30
               -gencode arch=compute_60,code=sm_60
               -gencode arch=compute_61,code=sm_61
               -gencode arch=compute_70,code=sm_70)
-set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --expt-relaxed-constexpr --expt-extended-lambda ${CUDA_ARCH})
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}
+    --expt-relaxed-constexpr
+    --expt-extended-lambda
+    -Xcudafe "--diag_suppress=integer_sign_change"
+    -Xcudafe "--diag_suppress=partial_override"
+    -Xcudafe "--diag_suppress=virtual_function_decl_hidden"
+    ${CUDA_ARCH})
 
 add_subdirectory(third_party)
 
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 00000000..87253f6e
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 neka-nat
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/src/cupoc/geometry/CMakeLists.txt b/src/cupoc/geometry/CMakeLists.txt
index 959425d4..3d7d246e 100644
--- a/src/cupoc/geometry/CMakeLists.txt
+++ b/src/cupoc/geometry/CMakeLists.txt
@@ -1,2 +1,6 @@
-cuda_add_library(cupoc_geometry pointcloud.cu estimate_normals.cu geometry3d.cu kdtree_flann.cu)
-target_link_libraries(cupoc_geometry cupoc_utility)
\ No newline at end of file
+cuda_add_library(cupoc_geometry pointcloud.cu
+                 estimate_normals.cu
+                 down_sample.cu
+                 geometry3d.cu
+                 kdtree_flann.cu)
+target_link_libraries(cupoc_geometry cupoc_utility flann_cuda)
\ No newline at end of file
diff --git a/src/cupoc/geometry/down_sample.cu b/src/cupoc/geometry/down_sample.cu
new file mode 100644
index 00000000..3f47aef1
--- /dev/null
+++ b/src/cupoc/geometry/down_sample.cu
@@ -0,0 +1,136 @@
+#include "cupoc/geometry/pointcloud.h"
+#include "cupoc/geometry/geometry3d.h"
+#include "cupoc/utility/console.h"
+#include "cupoc/utility/helper.h"
+#include <thrust/gather.h>
+
+
+using namespace cupoc;
+using namespace cupoc::geometry;
+
+namespace {
+
+struct compute_key_functor {
+    compute_key_functor(const Eigen::Vector3f& voxel_min_bound, float voxel_size)
+        : voxel_min_bound_(voxel_min_bound), voxel_size_(voxel_size) {};
+    const Eigen::Vector3f voxel_min_bound_;
+    const float voxel_size_;
+    __device__
+    Eigen::Vector3i operator()(const Eigen::Vector3f_u& pt) {
+        auto ref_coord = (pt - voxel_min_bound_) / voxel_size_;
+        return Eigen::Vector3i(int(floor(ref_coord(0))), int(floor(ref_coord(1))), int(floor(ref_coord(2))));
+    }
+};
+
+template<typename OutputIterator, class... Args>
+__host__
+int CalcAverageByKey(thrust::device_vector<Eigen::Vector3i>& keys,
+                     OutputIterator buf_begins, OutputIterator output_begins) {
+    const size_t n = keys.size();
+    thrust::sort_by_key(keys.begin(), keys.end(), buf_begins);
+
+    thrust::device_vector<Eigen::Vector3i> keys_out(n);
+    thrust::device_vector<int> counts(n);
+    auto end1 = thrust::reduce_by_key(keys.begin(), keys.end(),
+                                      thrust::make_constant_iterator(1),
+                                      keys_out.begin(), counts.begin());
+    int n_out = static_cast<int>(end1.second - counts.begin());
+    counts.resize(n_out);
+
+    thrust::equal_to<Eigen::Vector3i> binary_pred;
+    add_tuple_functor<Args...> add_func;
+    auto end2 = thrust::reduce_by_key(keys.begin(), keys.end(), buf_begins,
+                                      keys_out.begin(), output_begins,
+                                      binary_pred, add_func);
+
+    devided_tuple_functor<Args...> dv_func;
+    thrust::transform(output_begins, output_begins + n_out,
+                      counts.begin(), output_begins,
+                      dv_func);
+    return n_out;
+}
+
+}
+
+utility::shared_ptr<PointCloud> PointCloud::SelectDownSample(const thrust::device_vector<size_t> &indices, bool invert) const {
+    auto output = utility::shared_ptr<PointCloud>(new PointCloud());
+    const bool has_normals = HasNormals();
+    const bool has_colors = HasColors();
+
+    output->points_.resize(indices.size());
+    thrust::gather(indices.begin(), indices.end(), points_.begin(), output->points_.begin());
+    return output;
+}
+
+utility::shared_ptr<PointCloud> PointCloud::VoxelDownSample(float voxel_size) const {
+    auto output = utility::shared_ptr<PointCloud>(new PointCloud());
+    if (voxel_size <= 0.0) {
+        utility::LogWarning("[VoxelDownSample] voxel_size <= 0.\n");
+        return output;
+    }
+
+    const Eigen::Vector3f voxel_size3 = Eigen::Vector3f(voxel_size, voxel_size, voxel_size);
+    const Eigen::Vector3f voxel_min_bound = GetMinBound() - voxel_size3 * 0.5;
+    const Eigen::Vector3f voxel_max_bound = GetMaxBound() + voxel_size3 * 0.5;
+
+    if (voxel_size * std::numeric_limits<int>::max() < (voxel_max_bound - voxel_min_bound).maxCoeff()) {
+        utility::LogWarning("[VoxelDownSample] voxel_size is too small.\n");
+        return output;
+    }
+
+    const int n = points_.size();
+    const bool has_normals = HasNormals();
+    const bool has_colors = HasColors();
+    compute_key_functor ck_func(voxel_min_bound, voxel_size);
+    thrust::device_vector<Eigen::Vector3i> keys(n);
+    thrust::transform(points_.begin(), points_.end(), keys.begin(), ck_func);
+
+    thrust::device_vector<Eigen::Vector3f_u> sorted_points = points_;
+    output->points_.resize(n);
+    if (!has_normals && !has_colors) {
+        typedef thrust::tuple<thrust::device_vector<Eigen::Vector3f_u>::iterator> IteratorTuple;
+        typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+        auto n_out = CalcAverageByKey<ZipIterator, Eigen::Vector3f_u>(keys,
+                    thrust::make_zip_iterator(thrust::make_tuple(sorted_points.begin())),
+                    thrust::make_zip_iterator(thrust::make_tuple(output->points_.begin())));
+        output->points_.resize(n_out);
+    } else if (has_normals && !has_colors) {
+        thrust::device_vector<Eigen::Vector3f_u> sorted_normals = normals_;
+        output->normals_.resize(n);
+        typedef thrust::tuple<thrust::device_vector<Eigen::Vector3f_u>::iterator, thrust::device_vector<Eigen::Vector3f_u>::iterator> IteratorTuple;
+        typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+        auto n_out = CalcAverageByKey<ZipIterator, Eigen::Vector3f_u, Eigen::Vector3f_u>(keys,
+                    thrust::make_zip_iterator(thrust::make_tuple(sorted_points.begin(), sorted_normals.begin())),
+                    thrust::make_zip_iterator(thrust::make_tuple(output->points_.begin(), output->normals_.begin())));
+        output->points_.resize(n_out);
+        output->normals_.resize(n_out);
+    } else if (!has_normals && has_colors) {
+        thrust::device_vector<Eigen::Vector3f_u> sorted_colors = colors_;
+        output->colors_.resize(n);
+        typedef thrust::tuple<thrust::device_vector<Eigen::Vector3f_u>::iterator, thrust::device_vector<Eigen::Vector3f_u>::iterator> IteratorTuple;
+        typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+        auto n_out = CalcAverageByKey<ZipIterator, Eigen::Vector3f_u, Eigen::Vector3f_u>(keys,
+                    thrust::make_zip_iterator(thrust::make_tuple(sorted_points.begin(), sorted_colors.begin())),
+                    thrust::make_zip_iterator(thrust::make_tuple(output->points_.begin(), output->colors_.begin())));
+        output->points_.resize(n_out);
+        output->colors_.resize(n_out);
+    } else {
+        thrust::device_vector<Eigen::Vector3f_u> sorted_normals = normals_;
+        thrust::device_vector<Eigen::Vector3f_u> sorted_colors = colors_;
+        output->normals_.resize(n);
+        output->colors_.resize(n);
+        typedef thrust::tuple<thrust::device_vector<Eigen::Vector3f_u>::iterator, thrust::device_vector<Eigen::Vector3f_u>::iterator, thrust::device_vector<Eigen::Vector3f_u>::iterator> IteratorTuple;
+        typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+        auto n_out = CalcAverageByKey<ZipIterator, Eigen::Vector3f_u, Eigen::Vector3f_u, Eigen::Vector3f_u>(keys,
+                    thrust::make_zip_iterator(thrust::make_tuple(sorted_points.begin(), sorted_normals.begin(), sorted_colors.begin())),
+                    thrust::make_zip_iterator(thrust::make_tuple(output->points_.begin(), output->normals_.begin(), output->colors_.begin())));
+        output->points_.resize(n_out);
+        output->normals_.resize(n_out);
+        output->colors_.resize(n_out);
+    }
+
+    utility::LogDebug(
+            "Pointcloud down sampled from {:d} points to {:d} points.\n",
+            (int)points_.size(), (int)output->points_.size());
+    return output;
+}
diff --git a/src/cupoc/geometry/estimate_normals.cu b/src/cupoc/geometry/estimate_normals.cu
index 7aaa12fc..2c42d781 100644
--- a/src/cupoc/geometry/estimate_normals.cu
+++ b/src/cupoc/geometry/estimate_normals.cu
@@ -1,9 +1,262 @@
+#include <Eigen/Geometry>
 #include "cupoc/geometry/pointcloud.h"
 #include "cupoc/geometry/kdtree_flann.h"
+#include "cupoc/utility/console.h"
 
 using namespace cupoc;
 using namespace cupoc::geometry;
 
+namespace {
+
+__device__
+Eigen::Vector3f ComputeEigenvector0(const Eigen::Matrix3f &A, float eval0) {
+    Eigen::Vector3f row0(A(0, 0) - eval0, A(0, 1), A(0, 2));
+    Eigen::Vector3f row1(A(0, 1), A(1, 1) - eval0, A(1, 2));
+    Eigen::Vector3f row2(A(0, 2), A(1, 2), A(2, 2) - eval0);
+    Eigen::Vector3f r0xr1 = row0.cross(row1);
+    Eigen::Vector3f r0xr2 = row0.cross(row2);
+    Eigen::Vector3f r1xr2 = row1.cross(row2);
+    float d0 = r0xr1.dot(r0xr1);
+    float d1 = r0xr2.dot(r0xr2);
+    float d2 = r1xr2.dot(r1xr2);
+
+    float dmax = d0;
+    int imax = 0;
+    if (d1 > dmax) {
+        dmax = d1;
+        imax = 1;
+    }
+    if (d2 > dmax) {
+        imax = 2;
+    }
+
+    if (imax == 0) {
+        return r0xr1 / std::sqrt(d0);
+    } else if (imax == 1) {
+        return r0xr2 / std::sqrt(d1);
+    } else {
+        return r1xr2 / std::sqrt(d2);
+    }
+}
+    
+__device__
+Eigen::Vector3f ComputeEigenvector1(const Eigen::Matrix3f &A,
+                                    const Eigen::Vector3f &evec0,
+                                    double eval1) {
+    Eigen::Vector3f U, V;
+    if (std::abs(evec0(0)) > std::abs(evec0(1))) {
+        float inv_length =
+                1 / std::sqrt(evec0(0) * evec0(0) + evec0(2) * evec0(2));
+        U << -evec0(2) * inv_length, 0, evec0(0) * inv_length;
+    } else {
+        float inv_length =
+                1 / std::sqrt(evec0(1) * evec0(1) + evec0(2) * evec0(2));
+        U << 0, evec0(2) * inv_length, -evec0(1) * inv_length;
+    }
+    V = evec0.cross(U);
+
+    Eigen::Vector3f AU(A(0, 0) * U(0) + A(0, 1) * U(1) + A(0, 2) * U(2),
+                      A(0, 1) * U(0) + A(1, 1) * U(1) + A(1, 2) * U(2),
+                      A(0, 2) * U(0) + A(1, 2) * U(1) + A(2, 2) * U(2));
+
+    Eigen::Vector3f AV = {A(0, 0) * V(0) + A(0, 1) * V(1) + A(0, 2) * V(2),
+                          A(0, 1) * V(0) + A(1, 1) * V(1) + A(1, 2) * V(2),
+                          A(0, 2) * V(0) + A(1, 2) * V(1) + A(2, 2) * V(2)};
+    
+    float m00 = U(0) * AU(0) + U(1) * AU(1) + U(2) * AU(2) - eval1;
+    float m01 = U(0) * AV(0) + U(1) * AV(1) + U(2) * AV(2);
+    float m11 = V(0) * AV(0) + V(1) * AV(1) + V(2) * AV(2) - eval1;
+    
+    float absM00 = std::abs(m00);
+    float absM01 = std::abs(m01);
+    float absM11 = std::abs(m11);
+    float max_abs_comp;
+    if (absM00 >= absM11) {
+        max_abs_comp = max(absM00, absM01);
+        if (max_abs_comp > 0) {
+            if (absM00 >= absM01) {
+                m01 /= m00;
+                m00 = 1 / std::sqrt(1 + m01 * m01);
+                m01 *= m00;
+            } else {
+                m00 /= m01;
+                m01 = 1 / std::sqrt(1 + m00 * m00);
+                m00 *= m01;
+            }
+            return m01 * U - m00 * V;
+        } else {
+            return U;
+        }
+    } else {
+        max_abs_comp = max(absM11, absM01);
+        if (max_abs_comp > 0) {
+            if (absM11 >= absM01) {
+                m01 /= m11;
+                m11 = 1 / std::sqrt(1 + m01 * m01);
+                m01 *= m11;
+            } else {
+                m11 /= m01;
+                m01 = 1 / std::sqrt(1 + m11 * m11);
+                m11 *= m01;
+            }
+            return m11 * U - m01 * V;
+        } else {
+            return U;
+        }
+    }
+}
+
+__device__
+Eigen::Vector3f FastEigen3x3(Eigen::Matrix3f &A) {
+    // Previous version based on:
+    // https://en.wikipedia.org/wiki/Eigenvalue_algorithm#3.C3.973_matrices
+    // Current version based on
+    // https://www.geometrictools.com/Documentation/RobustEigenSymmetric3x3.pdf
+    // which handles edge cases like points on a plane
+
+    float max_coeff = A.maxCoeff();
+    if (max_coeff == 0) {
+        return Eigen::Vector3f::Zero();
+    }
+    A /= max_coeff;
+
+    float norm = A(0, 1) * A(0, 1) + A(0, 2) * A(0, 2) + A(1, 2) * A(1, 2);
+    if (norm > 0) {
+        Eigen::Vector3f eval;
+        Eigen::Vector3f evec0;
+        Eigen::Vector3f evec1;
+        Eigen::Vector3f evec2;
+
+        float q = (A(0, 0) + A(1, 1) + A(2, 2)) / 3;
+
+        float b00 = A(0, 0) - q;
+        float b11 = A(1, 1) - q;
+        float b22 = A(2, 2) - q;
+
+        float p =
+                std::sqrt((b00 * b00 + b11 * b11 + b22 * b22 + norm * 2) / 6);
+
+        float c00 = b11 * b22 - A(1, 2) * A(1, 2);
+        float c01 = A(0, 1) * b22 - A(1, 2) * A(0, 2);
+        float c02 = A(0, 1) * A(1, 2) - b11 * A(0, 2);
+        float det = (b00 * c00 - A(0, 1) * c01 + A(0, 2) * c02) / (p * p * p);
+
+        float half_det = det * 0.5;
+        half_det = min(max(half_det, -1.0), 1.0);
+
+        float angle = std::acos(half_det) / (float)3;
+        float const two_thirds_pi = 2.09439510239319549;
+        float beta2 = std::cos(angle) * 2;
+        float beta0 = std::cos(angle + two_thirds_pi) * 2;
+        float beta1 = -(beta0 + beta2);
+
+        eval(0) = q + p * beta0;
+        eval(1) = q + p * beta1;
+        eval(2) = q + p * beta2;
+
+        if (half_det >= 0) {
+            evec2 = ComputeEigenvector0(A, eval(2));
+            if (eval(2) < eval(0) && eval(2) < eval(1)) {
+                A *= max_coeff;
+                return evec2;
+            }
+            evec1 = ComputeEigenvector1(A, evec2, eval(1));
+            A *= max_coeff;
+            if (eval(1) < eval(0) && eval(1) < eval(2)) {
+                return evec1;
+            }
+            evec0 = evec1.cross(evec2);
+            return evec0;
+        } else {
+            evec0 = ComputeEigenvector0(A, eval(0));
+            if (eval(0) < eval(1) && eval(0) < eval(2)) {
+                A *= max_coeff;
+                return evec0;
+            }
+            evec1 = ComputeEigenvector1(A, evec0, eval(1));
+            A *= max_coeff;
+            if (eval(1) < eval(0) && eval(1) < eval(2)) {
+                return evec1;
+            }
+            evec2 = evec0.cross(evec1);
+            return evec2;
+        }
+    } else {
+        A *= max_coeff;
+        if (A(0, 0) < A(1, 1) && A(0, 0) < A(2, 2)) {
+            return Eigen::Vector3f(1, 0, 0);
+        } else if (A(1, 1) < A(0, 0) && A(1, 1) < A(2, 2)) {
+            return Eigen::Vector3f(0, 1, 0);
+        } else {
+            return Eigen::Vector3f(0, 0, 1);
+        }
+    }
+}
+
+__device__
+Eigen::Vector3f_u ComputeNormal(const Eigen::Vector3f_u* points,
+                                const KNNIndices &indices, int knn) {
+    if (indices[0] < 0) return Eigen::Vector3f_u(0.0, 0.0, 1.0);
+
+    Eigen::Matrix3f covariance;
+    Eigen::Matrix<float, 9, 1> cumulants;
+    cumulants.setZero();
+    for (size_t i = 0; i < knn; i++) {
+        const Eigen::Vector3f_u& point = points[indices[i]];
+        cumulants(0) += point(0);
+        cumulants(1) += point(1);
+        cumulants(2) += point(2);
+        cumulants(3) += point(0) * point(0);
+        cumulants(4) += point(0) * point(1);
+        cumulants(5) += point(0) * point(2);
+        cumulants(6) += point(1) * point(1);
+        cumulants(7) += point(1) * point(2);
+        cumulants(8) += point(2) * point(2);
+    }
+    cumulants /= (double)indices.size();
+    covariance(0, 0) = cumulants(3) - cumulants(0) * cumulants(0);
+    covariance(1, 1) = cumulants(6) - cumulants(1) * cumulants(1);
+    covariance(2, 2) = cumulants(8) - cumulants(2) * cumulants(2);
+    covariance(0, 1) = cumulants(4) - cumulants(0) * cumulants(1);
+    covariance(1, 0) = covariance(0, 1);
+    covariance(0, 2) = cumulants(5) - cumulants(0) * cumulants(2);
+    covariance(2, 0) = covariance(0, 2);
+    covariance(1, 2) = cumulants(7) - cumulants(1) * cumulants(2);
+    covariance(2, 1) = covariance(1, 2);
+
+    return FastEigen3x3(covariance);
+}
+
+struct compute_normal_functor {
+    compute_normal_functor(const Eigen::Vector3f_u* points, int knn) : points_(points), knn_(knn) {};
+    const Eigen::Vector3f_u* points_;
+    const int knn_;
+    __device__
+    Eigen::Vector3f_u operator()(const KNNIndices& x) const {
+        Eigen::Vector3f_u normal = ComputeNormal(points_, x, knn_);
+        if (normal.norm() == 0.0) {
+            normal = Eigen::Vector3f_u(0.0, 0.0, 1.0);
+        }
+        return normal;
+    }
+};
+
+struct align_normals_direction {
+    align_normals_direction(const Eigen::Vector3f& orientation_reference)
+        : orientation_reference_(orientation_reference) {};
+    const Eigen::Vector3f orientation_reference_;
+    __device__
+    void operator()(Eigen::Vector3f_u& normal) const {
+        if (normal.norm() == 0.0) {
+            normal = orientation_reference_;
+        } else if (normal.dot(orientation_reference_) < 0.0) {
+            normal *= -1.0;
+        }
+    }
+};
+
+}
+
 bool PointCloud::EstimateNormals(const KDTreeSearchParam &search_param) {
     const bool has_normal = HasNormals();
     if (HasNormals() == false) {
@@ -11,5 +264,23 @@ bool PointCloud::EstimateNormals(const KDTreeSearchParam &search_param) {
     }
     KDTreeFlann kdtree;
     kdtree.SetGeometry(*this);
+    thrust::device_vector<KNNIndices> indices;
+    thrust::device_vector<KNNDistances> distance2;
+    kdtree.Search(points_, search_param, indices, distance2);
+    normals_.resize(points_.size());
+    compute_normal_functor func(thrust::raw_pointer_cast(points_.data()), ((const KDTreeSearchParamKNN &)search_param).knn_);
+    thrust::transform(indices.begin(), indices.end(), normals_.begin(), func);
+    return true;
+}
+
+bool PointCloud::OrientNormalsToAlignWithDirection(const Eigen::Vector3f &orientation_reference) {
+    if (HasNormals() == false) {
+        utility::LogWarning(
+                "[OrientNormalsToAlignWithDirection] No normals in the "
+                "PointCloud. Call EstimateNormals() first.\n");
+        return false;
+    }
+    align_normals_direction func(orientation_reference);
+    thrust::for_each(normals_.begin(), normals_.end(), func);
     return true;
-}
\ No newline at end of file
+}
diff --git a/src/cupoc/geometry/kdtree_flann.cu b/src/cupoc/geometry/kdtree_flann.cu
index 9c6622ac..800bc58f 100644
--- a/src/cupoc/geometry/kdtree_flann.cu
+++ b/src/cupoc/geometry/kdtree_flann.cu
@@ -1,11 +1,48 @@
 
 #include "cupoc/geometry/kdtree_flann.h"
+#define FLANN_USE_CUDA
 #include <flann/flann.hpp>
 #include "cupoc/utility/console.h"
 
+using namespace cupoc;
+using namespace cupoc::geometry;
+
+namespace {
+
+template<typename ElementType, typename ArrayType>
+struct indexed_copy_functor {
+    indexed_copy_functor(int index)
+        : index_(index) {};
+    const int index_;
+    __device__
+    ArrayType operator()(const ElementType& x, const ArrayType& idxs) {
+        ArrayType ans = idxs;
+        ans[index_] = x;
+        return ans;
+    }
+};
+
+void ConvertIndicesAndDistaces(int knn, int n_query, thrust::device_vector<int> indices_dv,
+                               thrust::device_vector<float> distance2_dv,
+                               thrust::device_vector<KNNIndices> &indices,
+                               thrust::device_vector<KNNDistances> &distance2) {
+    const int total_size = knn * n_query;
+    indices.resize(n_query);
+    distance2.resize(n_query);
+    thrust::for_each(indices.begin(), indices.end(), [] __device__ (KNNIndices& idxs){idxs.fill(-1);});
+    thrust::for_each(distance2.begin(), distance2.end(), [] __device__ (KNNDistances& dist){dist.fill(-1.0);});
+    for (int k = 0; k < knn; ++k) {
+        thrust::strided_range<thrust::device_vector<int>::iterator> itr_idxs(indices_dv.begin() + k, indices_dv.begin() + total_size + 1 - knn + k, knn);
+        indexed_copy_functor<int, KNNIndices> icf_i(k);
+        thrust::transform(itr_idxs.begin(), itr_idxs.end(), indices.begin(), indices.begin(), icf_i);
+        thrust::strided_range<thrust::device_vector<float>::iterator> itr_dist(distance2_dv.begin() + k, distance2_dv.begin() + total_size + 1 - knn + k, knn);
+        indexed_copy_functor<float, KNNDistances> icf_d(k);
+        thrust::transform(itr_dist.begin(), itr_dist.end(), distance2.begin(), distance2.begin(), icf_d);
+    }
+}
+
+}
 
-namespace cupoc {
-namespace geometry {
 
 KDTreeFlann::KDTreeFlann() {}
 
@@ -25,10 +62,10 @@ bool KDTreeFlann::SetGeometry(const PointCloud &geometry) {
 }
 
 template <typename T>
-int KDTreeFlann::Search(const T &query,
+int KDTreeFlann::Search(const thrust::device_vector<T> &query,
                         const KDTreeSearchParam &param,
-                        thrust::device_vector<int> &indices,
-                        thrust::device_vector<float> &distance2) const {
+                        thrust::device_vector<KNNIndices> &indices,
+                        thrust::device_vector<KNNDistances> &distance2) const {
     switch (param.GetSearchType()) {
         case KDTreeSearchParam::SearchType::Knn:
             return SearchKNN(query, ((const KDTreeSearchParamKNN &)param).knn_,
@@ -49,80 +86,76 @@ int KDTreeFlann::Search(const T &query,
 }
 
 template <typename T>
-int KDTreeFlann::SearchKNN(const T &query,
+int KDTreeFlann::SearchKNN(const thrust::device_vector<T> &query,
                            int knn,
-                           thrust::device_vector<int> &indices,
-                           thrust::device_vector<float> &distance2) const {
+                           thrust::device_vector<KNNIndices> &indices,
+                           thrust::device_vector<KNNDistances> &distance2) const {
     // This is optimized code for heavily repeated search.
     // Other flann::Index::knnSearch() implementations lose performance due to
     // memory allocation/deallocation.
-    if (data_.empty() || dataset_size_ <= 0 ||
-        size_t(query.rows()) != dimension_ || knn < 0) {
-        return -1;
-    }
-    flann::Matrix<float> query_flann((float *)query.data(), 1, dimension_);
-    indices.resize(knn);
-    distance2.resize(knn);
-    flann::Matrix<int> indices_flann(thrust::raw_pointer_cast(indices.data()), query_flann.rows, knn);
-    flann::Matrix<float> dists_flann(thrust::raw_pointer_cast(distance2.data()), query_flann.rows, knn);
+    if (data_.empty() || query.empty() || dataset_size_ <= 0 || knn < 0 || knn > NUM_MAX_NN) return -1;
+    T query0 = query[0];
+    if (size_t(query0.size()) != dimension_) return -1;
+    flann::Matrix<float> query_flann((float *)(thrust::raw_pointer_cast(query.data())), query.size(), dimension_);
+    const int total_size = query.size() * knn;
+    thrust::device_vector<int> indices_dv(total_size);
+    thrust::device_vector<float> distance2_dv(total_size);
+    flann::Matrix<int> indices_flann(thrust::raw_pointer_cast(indices_dv.data()), query_flann.rows, knn);
+    flann::Matrix<float> dists_flann(thrust::raw_pointer_cast(distance2_dv.data()), query_flann.rows, knn);
     int k = flann_index_->knnSearch(query_flann, indices_flann, dists_flann,
                                     knn, flann::SearchParams(-1, 0.0));
-    indices.resize(k);
-    distance2.resize(k);
+    ConvertIndicesAndDistaces(knn, query.size(), indices_dv, distance2_dv, indices, distance2);
     return k;
 }
 
 template <typename T>
-int KDTreeFlann::SearchRadius(const T &query,
+int KDTreeFlann::SearchRadius(const thrust::device_vector<T> &query,
                               float radius,
-                              thrust::device_vector<int> &indices,
-                              thrust::device_vector<float> &distance2) const {
+                              thrust::device_vector<KNNIndices> &indices,
+                              thrust::device_vector<KNNDistances> &distance2) const {
     // This is optimized code for heavily repeated search.
     // Since max_nn is not given, we let flann to do its own memory management.
     // Other flann::Index::radiusSearch() implementations lose performance due
     // to memory management and CPU caching.
-    if (data_.empty() || dataset_size_ <= 0 ||
-        size_t(query.rows()) != dimension_) {
-        return -1;
-    }
-    flann::Matrix<float> query_flann((float *)query.data(), 1, dimension_);
+    if (data_.empty() || query.empty() || dataset_size_ <= 0) return -1;
+    T query0 = query[0];
+    if (size_t(query0.size()) != dimension_) return -1;
+    flann::Matrix<float> query_flann((float *)(thrust::raw_pointer_cast(query.data())), query.size(), dimension_);
     flann::SearchParams param(-1, 0.0);
-    param.max_neighbors = -1;
-    std::vector<std::vector<int>> indices_vec(1);
-    std::vector<std::vector<float>> dists_vec(1);
-    int k = flann_index_->radiusSearch(query_flann, indices_vec, dists_vec,
+    param.max_neighbors = NUM_MAX_NN;
+    thrust::device_vector<int> indices_dv(query.size() * NUM_MAX_NN);
+    thrust::device_vector<float> distance2_dv(query.size() * NUM_MAX_NN);
+    flann::Matrix<int> indices_flann(thrust::raw_pointer_cast(indices_dv.data()), query_flann.rows, NUM_MAX_NN);
+    flann::Matrix<float> dists_flann(thrust::raw_pointer_cast(distance2_dv.data()), query_flann.rows, NUM_MAX_NN);
+    int k = flann_index_->radiusSearch(query_flann, indices_flann, dists_flann,
                                        float(radius * radius), param);
-    indices = indices_vec[0];
-    distance2 = dists_vec[0];
+    ConvertIndicesAndDistaces(NUM_MAX_NN, query.size(), indices_dv, distance2_dv, indices, distance2);
     return k;
 }
 
 template <typename T>
-int KDTreeFlann::SearchHybrid(const T &query,
+int KDTreeFlann::SearchHybrid(const thrust::device_vector<T> &query,
                               float radius,
                               int max_nn,
-                              thrust::device_vector<int> &indices,
-                              thrust::device_vector<float> &distance2) const {
+                              thrust::device_vector<KNNIndices> &indices,
+                              thrust::device_vector<KNNDistances> &distance2) const {
     // This is optimized code for heavily repeated search.
     // It is also the recommended setting for search.
     // Other flann::Index::radiusSearch() implementations lose performance due
     // to memory allocation/deallocation.
-    if (data_.empty() || dataset_size_ <= 0 ||
-        size_t(query.rows()) != dimension_ || max_nn < 0) {
-        return -1;
-    }
-    flann::Matrix<float> query_flann((float *)query.data(), 1, dimension_);
+    if (data_.empty() || query.empty() || dataset_size_ <= 0 || max_nn < 0 || max_nn > NUM_MAX_NN) return -1;
+    T query0 = query[0];
+    if (size_t(query0.size()) != dimension_) return -1;
+    flann::Matrix<float> query_flann((float *)(thrust::raw_pointer_cast(query.data())), query.size(), dimension_);
     flann::SearchParams param(-1, 0.0);
     param.max_neighbors = max_nn;
-    indices.resize(max_nn);
-    distance2.resize(max_nn);
-    flann::Matrix<int> indices_flann(thrust::raw_pointer_cast(indices.data()), query_flann.rows, max_nn);
-    flann::Matrix<float> dists_flann(thrust::raw_pointer_cast(distance2.data()), query_flann.rows,
-                                      max_nn);
+    thrust::device_vector<int> indices_dv(query.size() * max_nn);
+    thrust::device_vector<float> distance2_dv(query.size() * max_nn);
+    flann::Matrix<int> indices_flann(thrust::raw_pointer_cast(indices_dv.data()), query_flann.rows, max_nn);
+    flann::Matrix<float> dists_flann(thrust::raw_pointer_cast(distance2_dv.data()), query_flann.rows, max_nn);
     int k = flann_index_->radiusSearch(query_flann, indices_flann, dists_flann,
                                        float(radius * radius), param);
-    indices.resize(k);
-    distance2.resize(k);
+    ConvertIndicesAndDistaces(max_nn, query.size(), indices_dv, distance2_dv, indices, distance2);
     return k;
 }
 
@@ -140,54 +173,51 @@ bool KDTreeFlann::SetRawData(const Eigen::Map<const Eigen::MatrixXf_u> &data) {
     flann_dataset_.reset(new flann::Matrix<float>(thrust::raw_pointer_cast(data_.data()),
                                                   dataset_size_, dimension_));
     flann_index_.reset(new flann::Index<flann::L2<float>>(
-            *flann_dataset_, flann::KDTreeSingleIndexParams(15)));
+            *flann_dataset_, flann::KDTreeCuda3dIndexParams()));
     flann_index_->buildIndex();
     return true;
 }
 
 template int KDTreeFlann::Search<Eigen::Vector3f>(
-        const Eigen::Vector3f &query,
+        const thrust::device_vector<Eigen::Vector3f> &query,
         const KDTreeSearchParam &param,
-        thrust::device_vector<int> &indices,
-        thrust::device_vector<float> &distance2) const;
+        thrust::device_vector<KNNIndices> &indices,
+        thrust::device_vector<KNNDistances> &distance2) const;
 template int KDTreeFlann::SearchKNN<Eigen::Vector3f>(
-        const Eigen::Vector3f &query,
+        const thrust::device_vector<Eigen::Vector3f> &query,
         int knn,
-        thrust::device_vector<int> &indices,
-        thrust::device_vector<float> &distance2) const;
+        thrust::device_vector<KNNIndices> &indices,
+        thrust::device_vector<KNNDistances> &distance2) const;
 template int KDTreeFlann::SearchRadius<Eigen::Vector3f>(
-        const Eigen::Vector3f &query,
+        const thrust::device_vector<Eigen::Vector3f> &query,
         float radius,
-        thrust::device_vector<int> &indices,
-        thrust::device_vector<float> &distance2) const;
+        thrust::device_vector<KNNIndices> &indices,
+        thrust::device_vector<KNNDistances> &distance2) const;
 template int KDTreeFlann::SearchHybrid<Eigen::Vector3f>(
-        const Eigen::Vector3f &query,
+        const thrust::device_vector<Eigen::Vector3f> &query,
         float radius,
         int max_nn,
-        thrust::device_vector<int> &indices,
-        thrust::device_vector<float> &distance2) const;
+        thrust::device_vector<KNNIndices> &indices,
+        thrust::device_vector<KNNDistances> &distance2) const;
 
 template int KDTreeFlann::Search<Eigen::VectorXf>(
-        const Eigen::VectorXf &query,
+        const thrust::device_vector<Eigen::VectorXf> &query,
         const KDTreeSearchParam &param,
-        thrust::device_vector<int> &indices,
-        thrust::device_vector<float> &distance2) const;
+        thrust::device_vector<KNNIndices> &indices,
+        thrust::device_vector<KNNDistances> &distance2) const;
 template int KDTreeFlann::SearchKNN<Eigen::VectorXf>(
-        const Eigen::VectorXf &query,
+        const thrust::device_vector<Eigen::VectorXf> &query,
         int knn,
-        thrust::device_vector<int> &indices,
-        thrust::device_vector<float> &distance2) const;
+        thrust::device_vector<KNNIndices> &indices,
+        thrust::device_vector<KNNDistances> &distance2) const;
 template int KDTreeFlann::SearchRadius<Eigen::VectorXf>(
-        const Eigen::VectorXf &query,
+        const thrust::device_vector<Eigen::VectorXf> &query,
         float radius,
-        thrust::device_vector<int> &indices,
-        thrust::device_vector<float> &distance2) const;
+        thrust::device_vector<KNNIndices> &indices,
+        thrust::device_vector<KNNDistances> &distance2) const;
 template int KDTreeFlann::SearchHybrid<Eigen::VectorXf>(
-        const Eigen::VectorXf &query,
+        const thrust::device_vector<Eigen::VectorXf> &query,
         float radius,
         int max_nn,
-        thrust::device_vector<int> &indices,
-        thrust::device_vector<float> &distance2) const;
-
-}  // namespace geometry
-}  // namespace cupoc
\ No newline at end of file
+        thrust::device_vector<KNNIndices> &indices,
+        thrust::device_vector<KNNDistances> &distance2) const;
diff --git a/src/cupoc/geometry/kdtree_flann.h b/src/cupoc/geometry/kdtree_flann.h
index 6903f1af..cd618b31 100644
--- a/src/cupoc/geometry/kdtree_flann.h
+++ b/src/cupoc/geometry/kdtree_flann.h
@@ -2,11 +2,11 @@
 
 #include <Eigen/Core>
 #include <memory>
-#include <thrust/device_vector.h>
 
 #include "cupoc/geometry/kdtree_search_param.h"
 #include "cupoc/geometry/pointcloud.h"
 #include "cupoc/utility/eigen.h"
+#include "cupoc/utility/helper.h"
 
 namespace flann {
 template <typename T>
@@ -34,29 +34,29 @@ class KDTreeFlann {
     bool SetGeometry(const PointCloud &geometry);
 
     template <typename T>
-    int Search(const T &query,
+    int Search(const thrust::device_vector<T> &query,
                const KDTreeSearchParam &param,
-               thrust::device_vector<int> &indices,
-               thrust::device_vector<float> &distance2) const;
+               thrust::device_vector<KNNIndices> &indices,
+               thrust::device_vector<KNNDistances> &distance2) const;
 
     template <typename T>
-    int SearchKNN(const T &query,
+    int SearchKNN(const thrust::device_vector<T> &query,
                   int knn,
-                  thrust::device_vector<int> &indices,
-                  thrust::device_vector<float> &distance2) const;
+                  thrust::device_vector<KNNIndices> &indices,
+                  thrust::device_vector<KNNDistances> &distance2) const;
 
     template <typename T>
-    int SearchRadius(const T &query,
+    int SearchRadius(const thrust::device_vector<T> &query,
                      float radius,
-                     thrust::device_vector<int> &indices,
-                     thrust::device_vector<float> &distance2) const;
+                     thrust::device_vector<KNNIndices> &indices,
+                     thrust::device_vector<KNNDistances> &distance2) const;
 
     template <typename T>
-    int SearchHybrid(const T &query,
+    int SearchHybrid(const thrust::device_vector<T> &query,
                      float radius,
                      int max_nn,
-                     thrust::device_vector<int> &indices,
-                     thrust::device_vector<float> &distance2) const;
+                     thrust::device_vector<KNNIndices> &indices,
+                     thrust::device_vector<KNNDistances> &distance2) const;
 
 private:
     bool SetRawData(const Eigen::Map<const Eigen::MatrixXf_u> &data);
diff --git a/src/cupoc/geometry/kdtree_search_param.h b/src/cupoc/geometry/kdtree_search_param.h
index 931a68e9..c9fbf9a1 100644
--- a/src/cupoc/geometry/kdtree_search_param.h
+++ b/src/cupoc/geometry/kdtree_search_param.h
@@ -3,6 +3,10 @@
 namespace cupoc {
 namespace geometry {
 
+static const int NUM_MAX_NN = 50;
+typedef Eigen::Matrix<int, NUM_MAX_NN, 1, Eigen::DontAlign> KNNIndices;
+typedef Eigen::Matrix<float, NUM_MAX_NN, 1, Eigen::DontAlign> KNNDistances;
+
 class KDTreeSearchParam {
 public:
     enum class SearchType {
diff --git a/src/cupoc/geometry/pointcloud.cu b/src/cupoc/geometry/pointcloud.cu
index 6c5afe56..565a3c0c 100644
--- a/src/cupoc/geometry/pointcloud.cu
+++ b/src/cupoc/geometry/pointcloud.cu
@@ -2,9 +2,7 @@
 #include "cupoc/geometry/geometry3d.h"
 #include "cupoc/utility/console.h"
 #include "cupoc/utility/helper.h"
-#include <thrust/for_each.h>
 #include <thrust/gather.h>
-#include <thrust/device_ptr.h>
 
 using namespace cupoc;
 using namespace cupoc::geometry;
@@ -37,46 +35,6 @@ struct cropped_copy_functor {
     }
 };
 
-struct compute_key_functor {
-    compute_key_functor(const Eigen::Vector3f& voxel_min_bound, float voxel_size)
-        : voxel_min_bound_(voxel_min_bound), voxel_size_(voxel_size) {};
-    const Eigen::Vector3f voxel_min_bound_;
-    const float voxel_size_;
-    __device__
-    Eigen::Vector3i operator()(const Eigen::Vector3f_u& pt) {
-        auto ref_coord = (pt - voxel_min_bound_) / voxel_size_;
-        return Eigen::Vector3i(int(floor(ref_coord(0))), int(floor(ref_coord(1))), int(floor(ref_coord(2))));
-    }
-};
-
-template<typename OutputIterator, class... Args>
-__host__
-int CalcAverageByKey(thrust::device_vector<Eigen::Vector3i>& keys,
-                     OutputIterator buf_begins, OutputIterator output_begins) {
-    const size_t n = keys.size();
-    thrust::sort_by_key(keys.begin(), keys.end(), buf_begins);
-
-    thrust::device_vector<Eigen::Vector3i> keys_out(n);
-    thrust::device_vector<int> counts(n);
-    auto end1 = thrust::reduce_by_key(keys.begin(), keys.end(),
-                                      thrust::make_constant_iterator(1),
-                                      keys_out.begin(), counts.begin());
-    int n_out = static_cast<int>(end1.second - counts.begin());
-    counts.resize(n_out);
-
-    thrust::equal_to<Eigen::Vector3i> binary_pred;
-    add_tuple_functor<Args...> add_func;
-    auto end2 = thrust::reduce_by_key(keys.begin(), keys.end(), buf_begins,
-                                      keys_out.begin(), output_begins,
-                                      binary_pred, add_func);
-
-    devided_tuple_functor<Args...> dv_func;
-    thrust::transform(output_begins, output_begins + n_out,
-                      counts.begin(), output_begins,
-                      dv_func);
-    return n_out;
-}
-
 }
 
 PointCloud::PointCloud() {}
@@ -101,89 +59,6 @@ PointCloud& PointCloud::Transform(const Eigen::Matrix4f& transformation) {
     return *this;
 }
 
-utility::shared_ptr<PointCloud> PointCloud::SelectDownSample(const thrust::device_vector<size_t> &indices, bool invert) const {
-    auto output = utility::shared_ptr<PointCloud>(new PointCloud());
-    const bool has_normals = HasNormals();
-    const bool has_colors = HasColors();
-
-    output->points_.resize(indices.size());
-    thrust::gather(indices.begin(), indices.end(), points_.begin(), output->points_.begin());
-    return output;
-}
-
-utility::shared_ptr<PointCloud> PointCloud::VoxelDownSample(float voxel_size) const {
-    auto output = utility::shared_ptr<PointCloud>(new PointCloud());
-    if (voxel_size <= 0.0) {
-        utility::LogWarning("[VoxelDownSample] voxel_size <= 0.\n");
-        return output;
-    }
-
-    const Eigen::Vector3f voxel_size3 = Eigen::Vector3f(voxel_size, voxel_size, voxel_size);
-    const Eigen::Vector3f voxel_min_bound = GetMinBound() - voxel_size3 * 0.5;
-    const Eigen::Vector3f voxel_max_bound = GetMaxBound() + voxel_size3 * 0.5;
-
-    if (voxel_size * std::numeric_limits<int>::max() < (voxel_max_bound - voxel_min_bound).maxCoeff()) {
-        utility::LogWarning("[VoxelDownSample] voxel_size is too small.\n");
-        return output;
-    }
-
-    const int n = points_.size();
-    const bool has_normals = HasNormals();
-    const bool has_colors = HasColors();
-    compute_key_functor ck_func(voxel_min_bound, voxel_size);
-    thrust::device_vector<Eigen::Vector3i> keys(n);
-    thrust::transform(points_.begin(), points_.end(), keys.begin(), ck_func);
-
-    thrust::device_vector<Eigen::Vector3f_u> sorted_points = points_;
-    output->points_.resize(n);
-    if (!has_normals && !has_colors) {
-        typedef thrust::tuple<thrust::device_vector<Eigen::Vector3f_u>::iterator> IteratorTuple;
-        typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
-        auto n_out = CalcAverageByKey<ZipIterator, Eigen::Vector3f_u>(keys,
-                    thrust::make_zip_iterator(thrust::make_tuple(sorted_points.begin())),
-                    thrust::make_zip_iterator(thrust::make_tuple(output->points_.begin())));
-        output->points_.resize(n_out);
-    } else if (has_normals && !has_colors) {
-        thrust::device_vector<Eigen::Vector3f_u> sorted_normals = normals_;
-        output->normals_.resize(n);
-        typedef thrust::tuple<thrust::device_vector<Eigen::Vector3f_u>::iterator, thrust::device_vector<Eigen::Vector3f_u>::iterator> IteratorTuple;
-        typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
-        auto n_out = CalcAverageByKey<ZipIterator, Eigen::Vector3f_u, Eigen::Vector3f_u>(keys,
-                    thrust::make_zip_iterator(thrust::make_tuple(sorted_points.begin(), sorted_normals.begin())),
-                    thrust::make_zip_iterator(thrust::make_tuple(output->points_.begin(), output->normals_.begin())));
-        output->points_.resize(n_out);
-        output->normals_.resize(n_out);
-    } else if (!has_normals && has_colors) {
-        thrust::device_vector<Eigen::Vector3f_u> sorted_colors = colors_;
-        output->colors_.resize(n);
-        typedef thrust::tuple<thrust::device_vector<Eigen::Vector3f_u>::iterator, thrust::device_vector<Eigen::Vector3f_u>::iterator> IteratorTuple;
-        typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
-        auto n_out = CalcAverageByKey<ZipIterator, Eigen::Vector3f_u, Eigen::Vector3f_u>(keys,
-                    thrust::make_zip_iterator(thrust::make_tuple(sorted_points.begin(), sorted_colors.begin())),
-                    thrust::make_zip_iterator(thrust::make_tuple(output->points_.begin(), output->colors_.begin())));
-        output->points_.resize(n_out);
-        output->colors_.resize(n_out);
-    } else {
-        thrust::device_vector<Eigen::Vector3f_u> sorted_normals = normals_;
-        thrust::device_vector<Eigen::Vector3f_u> sorted_colors = colors_;
-        output->normals_.resize(n);
-        output->colors_.resize(n);
-        typedef thrust::tuple<thrust::device_vector<Eigen::Vector3f_u>::iterator, thrust::device_vector<Eigen::Vector3f_u>::iterator, thrust::device_vector<Eigen::Vector3f_u>::iterator> IteratorTuple;
-        typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
-        auto n_out = CalcAverageByKey<ZipIterator, Eigen::Vector3f_u, Eigen::Vector3f_u, Eigen::Vector3f_u>(keys,
-                    thrust::make_zip_iterator(thrust::make_tuple(sorted_points.begin(), sorted_normals.begin(), sorted_colors.begin())),
-                    thrust::make_zip_iterator(thrust::make_tuple(output->points_.begin(), output->normals_.begin(), output->colors_.begin())));
-        output->points_.resize(n_out);
-        output->normals_.resize(n_out);
-        output->colors_.resize(n_out);
-    }
-
-    utility::LogDebug(
-            "Pointcloud down sampled from {:d} points to {:d} points.\n",
-            (int)points_.size(), (int)output->points_.size());
-    return output;
-}
-
 utility::shared_ptr<PointCloud> PointCloud::Crop(const Eigen::Vector3f &min_bound,
                                                  const Eigen::Vector3f &max_bound) const {
     auto output = utility::shared_ptr<PointCloud>(new PointCloud());
diff --git a/src/cupoc/geometry/pointcloud.h b/src/cupoc/geometry/pointcloud.h
index 3ec02b29..279a5b91 100644
--- a/src/cupoc/geometry/pointcloud.h
+++ b/src/cupoc/geometry/pointcloud.h
@@ -39,6 +39,7 @@ class PointCloud {
                                          const Eigen::Vector3f &max_bound) const;
 
     bool EstimateNormals(const KDTreeSearchParam& search_param = KDTreeSearchParamKNN());
+    bool OrientNormalsToAlignWithDirection(const Eigen::Vector3f &orientation_reference = Eigen::Vector3f(0.0, 0.0, 1.0));
 
 public:
     thrust::device_vector<Eigen::Vector3f_u> points_;
diff --git a/src/cupoc/io/class_io/pointcloud_io.cpp b/src/cupoc/io/class_io/pointcloud_io.cpp
new file mode 100644
index 00000000..da0cbe32
--- /dev/null
+++ b/src/cupoc/io/class_io/pointcloud_io.cpp
@@ -0,0 +1,104 @@
+#include "cupoc/io/class_io/pointcloud_io.h"
+
+#include <unordered_map>
+
+#include "cupoc/utility/console.h"
+#include "cupoc/utility/filesystem.h"
+
+namespace cupoc {
+
+namespace {
+using namespace io;
+
+static const std::unordered_map<
+        std::string,
+        std::function<bool(const std::string &, geometry::PointCloud &, bool)>>
+        file_extension_to_pointcloud_read_function{
+                {"pcd", ReadPointCloudFromPCD},
+        };
+
+static const std::unordered_map<std::string,
+                                std::function<bool(const std::string &,
+                                                   const geometry::PointCloud &,
+                                                   const bool,
+                                                   const bool,
+                                                   const bool)>>
+        file_extension_to_pointcloud_write_function{
+                {"pcd", WritePointCloudToPCD},
+        };
+}  // unnamed namespace
+
+namespace io {
+
+std::shared_ptr<geometry::PointCloud> CreatePointCloudFromFile(
+        const std::string &filename,
+        const std::string &format,
+        bool print_progress) {
+    auto pointcloud = utility::shapred_ptr<geometry::PointCloud>(new geometry::PointCloud());
+    ReadPointCloud(filename, *pointcloud, format, print_progress);
+    return pointcloud;
+}
+
+bool ReadPointCloud(const std::string &filename,
+                    geometry::PointCloud &pointcloud,
+                    const std::string &format,
+                    bool remove_nan_points,
+                    bool remove_infinite_points,
+                    bool print_progress) {
+    std::string filename_ext;
+    if (format == "auto") {
+        filename_ext =
+                utility::filesystem::GetFileExtensionInLowerCase(filename);
+    } else {
+        filename_ext = format;
+    }
+    if (filename_ext.empty()) {
+        utility::LogWarning(
+                "Read geometry::PointCloud failed: unknown file extension.\n");
+        return false;
+    }
+    auto map_itr =
+            file_extension_to_pointcloud_read_function.find(filename_ext);
+    if (map_itr == file_extension_to_pointcloud_read_function.end()) {
+        utility::LogWarning(
+                "Read geometry::PointCloud failed: unknown file extension.\n");
+        return false;
+    }
+    bool success = map_itr->second(filename, pointcloud, print_progress);
+    utility::LogDebug("Read geometry::PointCloud: {:d} vertices.\n",
+                      (int)pointcloud.points_.size());
+    if (remove_nan_points || remove_infinite_points) {
+        pointcloud.RemoveNoneFinitePoints(remove_nan_points,
+                                          remove_infinite_points);
+    }
+    return success;
+}
+
+bool WritePointCloud(const std::string &filename,
+                     const geometry::PointCloud &pointcloud,
+                     bool write_ascii /* = false*/,
+                     bool compressed /* = false*/,
+                     bool print_progress) {
+    std::string filename_ext =
+            utility::filesystem::GetFileExtensionInLowerCase(filename);
+    if (filename_ext.empty()) {
+        utility::LogWarning(
+                "Write geometry::PointCloud failed: unknown file extension.\n");
+        return false;
+    }
+    auto map_itr =
+            file_extension_to_pointcloud_write_function.find(filename_ext);
+    if (map_itr == file_extension_to_pointcloud_write_function.end()) {
+        utility::LogWarning(
+                "Write geometry::PointCloud failed: unknown file extension.\n");
+        return false;
+    }
+    bool success = map_itr->second(filename, pointcloud, write_ascii,
+                                   compressed, print_progress);
+    utility::LogDebug("Write geometry::PointCloud: {:d} vertices.\n",
+                      (int)pointcloud.points_.size());
+    return success;
+}
+
+}  // namespace io
+}  // namespace open3d
\ No newline at end of file
diff --git a/src/cupoc/io/class_io/pointcloud_io.h b/src/cupoc/io/class_io/pointcloud_io.h
new file mode 100644
index 00000000..688848a0
--- /dev/null
+++ b/src/cupoc/io/class_io/pointcloud_io.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include <string>
+
+#include "cupoc/geometry/pointcloud.h"
+#include "cupoc/utility/shared_ptr.hpp"
+
+namespace cupoc {
+namespace io {
+
+/// Factory function to create a pointcloud from a file (PointCloudFactory.cpp)
+/// Return an empty pointcloud if fail to read the file.
+utility::shared_ptr<geometry::PointCloud> CreatePointCloudFromFile(
+        const std::string &filename,
+        const std::string &format = "auto",
+        bool print_progress = false);
+
+/// The general entrance for reading a PointCloud from a file
+/// The function calls read functions based on the extension name of filename.
+/// \return return true if the read function is successful, false otherwise.
+bool ReadPointCloud(const std::string &filename,
+                    geometry::PointCloud &pointcloud,
+                    const std::string &format = "auto",
+                    bool remove_nan_points = true,
+                    bool remove_infinite_points = true,
+                    bool print_progress = false);
+
+/// The general entrance for writing a PointCloud to a file
+/// The function calls write functions based on the extension name of filename.
+/// If the write function supports binary encoding and compression, the later
+/// two parameters will be used. Otherwise they will be ignored.
+/// \return return true if the write function is successful, false otherwise.
+bool WritePointCloud(const std::string &filename,
+                     const geometry::PointCloud &pointcloud,
+                     bool write_ascii = false,
+                     bool compressed = false,
+                     bool print_progress = false);
+
+bool ReadPointCloudFromPCD(const std::string &filename,
+                           geometry::PointCloud &pointcloud,
+                           bool print_progress = false);
+
+bool WritePointCloudToPCD(const std::string &filename,
+                          const geometry::PointCloud &pointcloud,
+                          bool write_ascii = false,
+                          bool compressed = false,
+                          bool print_progress = false);
+
+}  // namespace io
+}  // namespace cupoc
\ No newline at end of file
diff --git a/src/cupoc/io/file_format/file_pcd.cpp b/src/cupoc/io/file_format/file_pcd.cpp
new file mode 100644
index 00000000..9ed913ad
--- /dev/null
+++ b/src/cupoc/io/file_format/file_pcd.cpp
@@ -0,0 +1,755 @@
+#include <liblzf/lzf.h>
+#include <cstdint>
+#include <cstdio>
+#include <sstream>
+
+#include "cupoc/op/class_io/pointcloud_io.h"
+#include "cupoc/utility/console.h"
+#include "cupoc/utility/helper.h"
+
+// References for PCD file IO
+// http://pointclouds.org/documentation/tutorials/pcd_file_format.php
+// https://github.com/PointCloudLibrary/pcl/blob/master/io/src/pcd_io.cpp
+// https://www.mathworks.com/matlabcentral/fileexchange/40382-matlab-to-point-cloud-library
+
+namespace cupoc {
+
+namespace {
+using namespace io;
+
+enum PCDDataType {
+    PCD_DATA_ASCII = 0,
+    PCD_DATA_BINARY = 1,
+    PCD_DATA_BINARY_COMPRESSED = 2
+};
+
+struct PCLPointField {
+public:
+    std::string name;
+    int size;
+    char type;
+    int count;
+    // helper variable
+    int count_offset;
+    int offset;
+};
+
+struct PCDHeader {
+public:
+    std::string version;
+    std::vector<PCLPointField> fields;
+    int width;
+    int height;
+    int points;
+    PCDDataType datatype;
+    std::string viewpoint;
+    // helper variables
+    int elementnum;
+    int pointsize;
+    bool has_points;
+    bool has_normals;
+    bool has_colors;
+};
+
+bool CheckHeader(PCDHeader &header) {
+    if (header.points <= 0 || header.pointsize <= 0) {
+        utility::LogWarning("[CheckHeader] PCD has no data.\n");
+        return false;
+    }
+    if (header.fields.size() == 0 || header.pointsize <= 0) {
+        utility::LogWarning("[CheckHeader] PCD has no fields.\n");
+        return false;
+    }
+    header.has_points = false;
+    header.has_normals = false;
+    header.has_colors = false;
+    bool has_x = false;
+    bool has_y = false;
+    bool has_z = false;
+    bool has_normal_x = false;
+    bool has_normal_y = false;
+    bool has_normal_z = false;
+    bool has_rgb = false;
+    bool has_rgba = false;
+    for (const auto &field : header.fields) {
+        if (field.name == "x") {
+            has_x = true;
+        } else if (field.name == "y") {
+            has_y = true;
+        } else if (field.name == "z") {
+            has_z = true;
+        } else if (field.name == "normal_x") {
+            has_normal_x = true;
+        } else if (field.name == "normal_y") {
+            has_normal_y = true;
+        } else if (field.name == "normal_z") {
+            has_normal_z = true;
+        } else if (field.name == "rgb") {
+            has_rgb = true;
+        } else if (field.name == "rgba") {
+            has_rgba = true;
+        }
+    }
+    header.has_points = (has_x && has_y && has_z);
+    header.has_normals = (has_normal_x && has_normal_y && has_normal_z);
+    header.has_colors = (has_rgb || has_rgba);
+    if (header.has_points == false) {
+        utility::LogWarning(
+                "[CheckHeader] Fields for point data are not complete.\n");
+        return false;
+    }
+    return true;
+}
+
+bool ReadPCDHeader(FILE *file, PCDHeader &header) {
+    char line_buffer[DEFAULT_IO_BUFFER_SIZE];
+    size_t specified_channel_count = 0;
+
+    while (fgets(line_buffer, DEFAULT_IO_BUFFER_SIZE, file)) {
+        std::string line(line_buffer);
+        if (line == "") {
+            continue;
+        }
+        std::vector<std::string> st;
+        utility::SplitString(st, line, "\t\r\n ");
+        std::stringstream sstream(line);
+        sstream.imbue(std::locale::classic());
+        std::string line_type;
+        sstream >> line_type;
+        if (line_type.substr(0, 1) == "#") {
+        } else if (line_type.substr(0, 7) == "VERSION") {
+            if (st.size() >= 2) {
+                header.version = st[1];
+            }
+        } else if (line_type.substr(0, 6) == "FIELDS" ||
+                   line_type.substr(0, 7) == "COLUMNS") {
+            specified_channel_count = st.size() - 1;
+            if (specified_channel_count == 0) {
+                utility::LogWarning("[ReadPCDHeader] Bad PCD file format.\n");
+                return false;
+            }
+            header.fields.resize(specified_channel_count);
+            int count_offset = 0, offset = 0;
+            for (size_t i = 0; i < specified_channel_count;
+                 i++, count_offset += 1, offset += 4) {
+                header.fields[i].name = st[i + 1];
+                header.fields[i].size = 4;
+                header.fields[i].type = 'F';
+                header.fields[i].count = 1;
+                header.fields[i].count_offset = count_offset;
+                header.fields[i].offset = offset;
+            }
+            header.elementnum = count_offset;
+            header.pointsize = offset;
+        } else if (line_type.substr(0, 4) == "SIZE") {
+            if (specified_channel_count != st.size() - 1) {
+                utility::LogWarning("[ReadPCDHeader] Bad PCD file format.\n");
+                return false;
+            }
+            int offset = 0, col_type = 0;
+            for (size_t i = 0; i < specified_channel_count;
+                 i++, offset += col_type) {
+                sstream >> col_type;
+                header.fields[i].size = col_type;
+                header.fields[i].offset = offset;
+            }
+            header.pointsize = offset;
+        } else if (line_type.substr(0, 4) == "TYPE") {
+            if (specified_channel_count != st.size() - 1) {
+                utility::LogWarning("[ReadPCDHeader] Bad PCD file format.\n");
+                return false;
+            }
+            for (size_t i = 0; i < specified_channel_count; i++) {
+                header.fields[i].type = st[i + 1].c_str()[0];
+            }
+        } else if (line_type.substr(0, 5) == "COUNT") {
+            if (specified_channel_count != st.size() - 1) {
+                utility::LogWarning("[ReadPCDHeader] Bad PCD file format.\n");
+                return false;
+            }
+            int count_offset = 0, offset = 0, col_count = 0;
+            for (size_t i = 0; i < specified_channel_count; i++) {
+                sstream >> col_count;
+                header.fields[i].count = col_count;
+                header.fields[i].count_offset = count_offset;
+                header.fields[i].offset = offset;
+                count_offset += col_count;
+                offset += col_count * header.fields[i].size;
+            }
+            header.elementnum = count_offset;
+            header.pointsize = offset;
+        } else if (line_type.substr(0, 5) == "WIDTH") {
+            sstream >> header.width;
+        } else if (line_type.substr(0, 6) == "HEIGHT") {
+            sstream >> header.height;
+            header.points = header.width * header.height;
+        } else if (line_type.substr(0, 9) == "VIEWPOINT") {
+            if (st.size() >= 2) {
+                header.viewpoint = st[1];
+            }
+        } else if (line_type.substr(0, 6) == "POINTS") {
+            sstream >> header.points;
+        } else if (line_type.substr(0, 4) == "DATA") {
+            header.datatype = PCD_DATA_ASCII;
+            if (st.size() >= 2) {
+                if (st[1].substr(0, 17) == "binary_compressed") {
+                    header.datatype = PCD_DATA_BINARY_COMPRESSED;
+                } else if (st[1].substr(0, 6) == "binary") {
+                    header.datatype = PCD_DATA_BINARY;
+                }
+            }
+            break;
+        }
+    }
+    if (CheckHeader(header) == false) {
+        return false;
+    }
+    return true;
+}
+
+float UnpackBinaryPCDElement(const char *data_ptr,
+                             const char type,
+                             const int size) {
+    if (type == 'I') {
+        if (size == 1) {
+            std::int8_t data;
+            memcpy(&data, data_ptr, sizeof(data));
+            return (float)data;
+        } else if (size == 2) {
+            std::int16_t data;
+            memcpy(&data, data_ptr, sizeof(data));
+            return (float)data;
+        } else if (size == 4) {
+            std::int32_t data;
+            memcpy(&data, data_ptr, sizeof(data));
+            return (float)data;
+        } else {
+            return 0.0;
+        }
+    } else if (type == 'U') {
+        if (size == 1) {
+            std::uint8_t data;
+            memcpy(&data, data_ptr, sizeof(data));
+            return (float)data;
+        } else if (size == 2) {
+            std::uint16_t data;
+            memcpy(&data, data_ptr, sizeof(data));
+            return (float)data;
+        } else if (size == 4) {
+            std::uint32_t data;
+            memcpy(&data, data_ptr, sizeof(data));
+            return (float)data;
+        } else {
+            return 0.0;
+        }
+    } else if (type == 'F') {
+        if (size == 4) {
+            std::float_t data;
+            memcpy(&data, data_ptr, sizeof(data));
+            return (float)data;
+        } else {
+            return 0.0;
+        }
+    }
+    return 0.0;
+}
+
+Eigen::Vector3f UnpackBinaryPCDColor(const char *data_ptr,
+                                     const char type,
+                                     const int size) {
+    if (size == 4) {
+        std::uint8_t data[4];
+        memcpy(data, data_ptr, 4);
+        // color data is packed in BGR order.
+        return Eigen::Vector3f((float)data[2] / 255.0, (float)data[1] / 255.0,
+                               (float)data[0] / 255.0);
+    } else {
+        return Eigen::Vector3f::Zero();
+    }
+}
+
+float UnpackASCIIPCDElement(const char *data_ptr,
+                            const char type,
+                            const int size) {
+    char *end;
+    if (type == 'I') {
+        return (float)std::strtol(data_ptr, &end, 0);
+    } else if (type == 'U') {
+        return (float)std::strtoul(data_ptr, &end, 0);
+    } else if (type == 'F') {
+        return std::strtod(data_ptr, &end);
+    }
+    return 0.0;
+}
+
+Eigen::Vector3f UnpackASCIIPCDColor(const char *data_ptr,
+                                    const char type,
+                                    const int size) {
+    if (size == 4) {
+        std::uint8_t data[4] = {0, 0, 0, 0};
+        char *end;
+        if (type == 'I') {
+            std::int32_t value = std::strtol(data_ptr, &end, 0);
+            memcpy(data, &value, 4);
+        } else if (type == 'U') {
+            std::uint32_t value = std::strtoul(data_ptr, &end, 0);
+            memcpy(data, &value, 4);
+        } else if (type == 'F') {
+            std::float_t value = std::strtof(data_ptr, &end);
+            memcpy(data, &value, 4);
+        }
+        return Eigen::Vector3f((float)data[2] / 255.0, (float)data[1] / 255.0,
+                               (float)data[0] / 255.0);
+    } else {
+        return Eigen::Vector3f::Zero();
+    }
+}
+
+bool ReadPCDData(FILE *file,
+                 const PCDHeader &header,
+                 geometry::PointCloud &pointcloud) {
+    // The header should have been checked
+    if (header.has_points) {
+        pointcloud.points_.resize(header.points);
+    } else {
+        utility::LogWarning(
+                "[ReadPCDData] Fields for point data are not complete.\n");
+        return false;
+    }
+    if (header.has_normals) {
+        pointcloud.normals_.resize(header.points);
+    }
+    if (header.has_colors) {
+        pointcloud.colors_.resize(header.points);
+    }
+    if (header.datatype == PCD_DATA_ASCII) {
+        char line_buffer[DEFAULT_IO_BUFFER_SIZE];
+        int idx = 0;
+        while (fgets(line_buffer, DEFAULT_IO_BUFFER_SIZE, file) &&
+               idx < header.points) {
+            std::string line(line_buffer);
+            std::vector<std::string> strs;
+            utility::SplitString(strs, line, "\t\r\n ");
+            if ((int)strs.size() < header.elementnum) {
+                continue;
+            }
+            for (size_t i = 0; i < header.fields.size(); i++) {
+                const auto &field = header.fields[i];
+                if (field.name == "x") {
+                    pointcloud.points_[idx](0) = UnpackASCIIPCDElement(
+                            strs[field.count_offset].c_str(), field.type,
+                            field.size);
+                } else if (field.name == "y") {
+                    pointcloud.points_[idx](1) = UnpackASCIIPCDElement(
+                            strs[field.count_offset].c_str(), field.type,
+                            field.size);
+                } else if (field.name == "z") {
+                    pointcloud.points_[idx](2) = UnpackASCIIPCDElement(
+                            strs[field.count_offset].c_str(), field.type,
+                            field.size);
+                } else if (field.name == "normal_x") {
+                    pointcloud.normals_[idx](0) = UnpackASCIIPCDElement(
+                            strs[field.count_offset].c_str(), field.type,
+                            field.size);
+                } else if (field.name == "normal_y") {
+                    pointcloud.normals_[idx](1) = UnpackASCIIPCDElement(
+                            strs[field.count_offset].c_str(), field.type,
+                            field.size);
+                } else if (field.name == "normal_z") {
+                    pointcloud.normals_[idx](2) = UnpackASCIIPCDElement(
+                            strs[field.count_offset].c_str(), field.type,
+                            field.size);
+                } else if (field.name == "rgb" || field.name == "rgba") {
+                    pointcloud.colors_[idx] = UnpackASCIIPCDColor(
+                            strs[field.count_offset].c_str(), field.type,
+                            field.size);
+                }
+            }
+            idx++;
+        }
+    } else if (header.datatype == PCD_DATA_BINARY) {
+        std::unique_ptr<char[]> buffer(new char[header.pointsize]);
+        for (int i = 0; i < header.points; i++) {
+            if (fread(buffer.get(), header.pointsize, 1, file) != 1) {
+                utility::LogWarning(
+                        "[ReadPCDData] Failed to read data record.\n");
+                pointcloud.Clear();
+                return false;
+            }
+            for (const auto &field : header.fields) {
+                if (field.name == "x") {
+                    pointcloud.points_[i](0) =
+                            UnpackBinaryPCDElement(buffer.get() + field.offset,
+                                                   field.type, field.size);
+                } else if (field.name == "y") {
+                    pointcloud.points_[i](1) =
+                            UnpackBinaryPCDElement(buffer.get() + field.offset,
+                                                   field.type, field.size);
+                } else if (field.name == "z") {
+                    pointcloud.points_[i](2) =
+                            UnpackBinaryPCDElement(buffer.get() + field.offset,
+                                                   field.type, field.size);
+                } else if (field.name == "normal_x") {
+                    pointcloud.normals_[i](0) =
+                            UnpackBinaryPCDElement(buffer.get() + field.offset,
+                                                   field.type, field.size);
+                } else if (field.name == "normal_y") {
+                    pointcloud.normals_[i](1) =
+                            UnpackBinaryPCDElement(buffer.get() + field.offset,
+                                                   field.type, field.size);
+                } else if (field.name == "normal_z") {
+                    pointcloud.normals_[i](2) =
+                            UnpackBinaryPCDElement(buffer.get() + field.offset,
+                                                   field.type, field.size);
+                } else if (field.name == "rgb" || field.name == "rgba") {
+                    pointcloud.colors_[i] =
+                            UnpackBinaryPCDColor(buffer.get() + field.offset,
+                                                 field.type, field.size);
+                }
+            }
+        }
+    } else if (header.datatype == PCD_DATA_BINARY_COMPRESSED) {
+        std::uint32_t compressed_size;
+        std::uint32_t uncompressed_size;
+        if (fread(&compressed_size, sizeof(compressed_size), 1, file) != 1) {
+            utility::LogWarning("[ReadPCDData] Failed to read data record.\n");
+            pointcloud.Clear();
+            return false;
+        }
+        if (fread(&uncompressed_size, sizeof(uncompressed_size), 1, file) !=
+            1) {
+            utility::LogWarning("[ReadPCDData] Failed to read data record.\n");
+            pointcloud.Clear();
+            return false;
+        }
+        utility::LogWarning(
+                "PCD data with {:d} compressed size, and {:d} uncompressed "
+                "size.\n",
+                compressed_size, uncompressed_size);
+        std::unique_ptr<char[]> buffer_compressed(new char[compressed_size]);
+        if (fread(buffer_compressed.get(), 1, compressed_size, file) !=
+            compressed_size) {
+            utility::LogWarning("[ReadPCDData] Failed to read data record.\n");
+            pointcloud.Clear();
+            return false;
+        }
+        std::unique_ptr<char[]> buffer(new char[uncompressed_size]);
+        if (lzf_decompress(buffer_compressed.get(),
+                           (unsigned int)compressed_size, buffer.get(),
+                           (unsigned int)uncompressed_size) !=
+            uncompressed_size) {
+            utility::LogWarning("[ReadPCDData] Uncompression failed.\n");
+            pointcloud.Clear();
+            return false;
+        }
+        for (const auto &field : header.fields) {
+            const char *base_ptr = buffer.get() + field.offset * header.points;
+            if (field.name == "x") {
+                for (int i = 0; i < header.points; i++) {
+                    pointcloud.points_[i](0) = UnpackBinaryPCDElement(
+                            base_ptr + i * field.size * field.count, field.type,
+                            field.size);
+                }
+            } else if (field.name == "y") {
+                for (int i = 0; i < header.points; i++) {
+                    pointcloud.points_[i](1) = UnpackBinaryPCDElement(
+                            base_ptr + i * field.size * field.count, field.type,
+                            field.size);
+                }
+            } else if (field.name == "z") {
+                for (int i = 0; i < header.points; i++) {
+                    pointcloud.points_[i](2) = UnpackBinaryPCDElement(
+                            base_ptr + i * field.size * field.count, field.type,
+                            field.size);
+                }
+            } else if (field.name == "normal_x") {
+                for (int i = 0; i < header.points; i++) {
+                    pointcloud.normals_[i](0) = UnpackBinaryPCDElement(
+                            base_ptr + i * field.size * field.count, field.type,
+                            field.size);
+                }
+            } else if (field.name == "normal_y") {
+                for (int i = 0; i < header.points; i++) {
+                    pointcloud.normals_[i](1) = UnpackBinaryPCDElement(
+                            base_ptr + i * field.size * field.count, field.type,
+                            field.size);
+                }
+            } else if (field.name == "normal_z") {
+                for (int i = 0; i < header.points; i++) {
+                    pointcloud.normals_[i](2) = UnpackBinaryPCDElement(
+                            base_ptr + i * field.size * field.count, field.type,
+                            field.size);
+                }
+            } else if (field.name == "rgb" || field.name == "rgba") {
+                for (int i = 0; i < header.points; i++) {
+                    pointcloud.colors_[i] = UnpackBinaryPCDColor(
+                            base_ptr + i * field.size * field.count, field.type,
+                            field.size);
+                }
+            }
+        }
+    }
+    return true;
+}
+
+bool GenerateHeader(const geometry::PointCloud &pointcloud,
+                    const bool write_ascii,
+                    const bool compressed,
+                    PCDHeader &header) {
+    if (pointcloud.HasPoints() == false) {
+        return false;
+    }
+    header.version = "0.7";
+    header.width = (int)pointcloud.points_.size();
+    header.height = 1;
+    header.points = header.width;
+    header.fields.clear();
+    PCLPointField field;
+    field.type = 'F';
+    field.size = 4;
+    field.count = 1;
+    field.name = "x";
+    header.fields.push_back(field);
+    field.name = "y";
+    header.fields.push_back(field);
+    field.name = "z";
+    header.fields.push_back(field);
+    header.elementnum = 3;
+    header.pointsize = 12;
+    if (pointcloud.HasNormals()) {
+        field.name = "normal_x";
+        header.fields.push_back(field);
+        field.name = "normal_y";
+        header.fields.push_back(field);
+        field.name = "normal_z";
+        header.fields.push_back(field);
+        header.elementnum += 3;
+        header.pointsize += 12;
+    }
+    if (pointcloud.HasColors()) {
+        field.name = "rgb";
+        header.fields.push_back(field);
+        header.elementnum++;
+        header.pointsize += 4;
+    }
+    if (write_ascii) {
+        header.datatype = PCD_DATA_ASCII;
+    } else {
+        if (compressed) {
+            header.datatype = PCD_DATA_BINARY_COMPRESSED;
+        } else {
+            header.datatype = PCD_DATA_BINARY;
+        }
+    }
+    return true;
+}
+
+bool WritePCDHeader(FILE *file, const PCDHeader &header) {
+    fprintf(file, "# .PCD v%s - Point Cloud Data file format\n",
+            header.version.c_str());
+    fprintf(file, "VERSION %s\n", header.version.c_str());
+    fprintf(file, "FIELDS");
+    for (const auto &field : header.fields) {
+        fprintf(file, " %s", field.name.c_str());
+    }
+    fprintf(file, "\n");
+    fprintf(file, "SIZE");
+    for (const auto &field : header.fields) {
+        fprintf(file, " %d", field.size);
+    }
+    fprintf(file, "\n");
+    fprintf(file, "TYPE");
+    for (const auto &field : header.fields) {
+        fprintf(file, " %c", field.type);
+    }
+    fprintf(file, "\n");
+    fprintf(file, "COUNT");
+    for (const auto &field : header.fields) {
+        fprintf(file, " %d", field.count);
+    }
+    fprintf(file, "\n");
+    fprintf(file, "WIDTH %d\n", header.width);
+    fprintf(file, "HEIGHT %d\n", header.height);
+    fprintf(file, "VIEWPOINT 0 0 0 1 0 0 0\n");
+    fprintf(file, "POINTS %d\n", header.points);
+
+    switch (header.datatype) {
+        case PCD_DATA_BINARY:
+            fprintf(file, "DATA binary\n");
+            break;
+        case PCD_DATA_BINARY_COMPRESSED:
+            fprintf(file, "DATA binary_compressed\n");
+            break;
+        case PCD_DATA_ASCII:
+        default:
+            fprintf(file, "DATA ascii\n");
+            break;
+    }
+    return true;
+}
+
+float ConvertRGBToFloat(const Eigen::Vector3f &color) {
+    std::uint8_t rgba[4] = {0, 0, 0, 0};
+    rgba[2] = (std::uint8_t)std::max(std::min((int)(color(0) * 255.0), 255), 0);
+    rgba[1] = (std::uint8_t)std::max(std::min((int)(color(1) * 255.0), 255), 0);
+    rgba[0] = (std::uint8_t)std::max(std::min((int)(color(2) * 255.0), 255), 0);
+    float value;
+    memcpy(&value, rgba, 4);
+    return value;
+}
+
+bool WritePCDData(FILE *file,
+                  const PCDHeader &header,
+                  const geometry::PointCloud &pointcloud) {
+    bool has_normal = pointcloud.HasNormals();
+    bool has_color = pointcloud.HasColors();
+    if (header.datatype == PCD_DATA_ASCII) {
+        for (size_t i = 0; i < pointcloud.points_.size(); i++) {
+            const auto &point = pointcloud.points_[i];
+            fprintf(file, "%.10g %.10g %.10g", point(0), point(1), point(2));
+            if (has_normal) {
+                const auto &normal = pointcloud.normals_[i];
+                fprintf(file, " %.10g %.10g %.10g", normal(0), normal(1),
+                        normal(2));
+            }
+            if (has_color) {
+                const auto &color = pointcloud.colors_[i];
+                fprintf(file, " %.10g", ConvertRGBToFloat(color));
+            }
+            fprintf(file, "\n");
+        }
+    } else if (header.datatype == PCD_DATA_BINARY) {
+        std::unique_ptr<float[]> data(new float[header.elementnum]);
+        for (size_t i = 0; i < pointcloud.points_.size(); i++) {
+            const auto &point = pointcloud.points_[i];
+            data[0] = (float)point(0);
+            data[1] = (float)point(1);
+            data[2] = (float)point(2);
+            int idx = 3;
+            if (has_normal) {
+                const auto &normal = pointcloud.normals_[i];
+                data[idx + 0] = (float)normal(0);
+                data[idx + 1] = (float)normal(1);
+                data[idx + 2] = (float)normal(2);
+                idx += 3;
+            }
+            if (has_color) {
+                const auto &color = pointcloud.colors_[i];
+                data[idx] = ConvertRGBToFloat(color);
+            }
+            fwrite(data.get(), sizeof(float), header.elementnum, file);
+        }
+    } else if (header.datatype == PCD_DATA_BINARY_COMPRESSED) {
+        int strip_size = header.points;
+        std::uint32_t buffer_size =
+                (std::uint32_t)(header.elementnum * header.points);
+        std::unique_ptr<float[]> buffer(new float[buffer_size]);
+        std::unique_ptr<float[]> buffer_compressed(new float[buffer_size * 2]);
+        for (size_t i = 0; i < pointcloud.points_.size(); i++) {
+            const auto &point = pointcloud.points_[i];
+            buffer[0 * strip_size + i] = (float)point(0);
+            buffer[1 * strip_size + i] = (float)point(1);
+            buffer[2 * strip_size + i] = (float)point(2);
+            int idx = 3;
+            if (has_normal) {
+                const auto &normal = pointcloud.normals_[i];
+                buffer[(idx + 0) * strip_size + i] = (float)normal(0);
+                buffer[(idx + 1) * strip_size + i] = (float)normal(1);
+                buffer[(idx + 2) * strip_size + i] = (float)normal(2);
+                idx += 3;
+            }
+            if (has_color) {
+                const auto &color = pointcloud.colors_[i];
+                buffer[idx * strip_size + i] = ConvertRGBToFloat(color);
+            }
+        }
+        std::uint32_t buffer_size_in_bytes = buffer_size * sizeof(float);
+        std::uint32_t size_compressed =
+                lzf_compress(buffer.get(), buffer_size_in_bytes,
+                             buffer_compressed.get(), buffer_size_in_bytes * 2);
+        if (size_compressed == 0) {
+            utility::LogWarning("[WritePCDData] Failed to compress data.\n");
+            return false;
+        }
+        utility::LogDebug(
+                "[WritePCDData] {:d} bytes data compressed into {:d} bytes.\n",
+                buffer_size_in_bytes, size_compressed);
+        fwrite(&size_compressed, sizeof(size_compressed), 1, file);
+        fwrite(&buffer_size_in_bytes, sizeof(buffer_size_in_bytes), 1, file);
+        fwrite(buffer_compressed.get(), 1, size_compressed, file);
+    }
+    return true;
+}
+
+}  // unnamed namespace
+
+namespace io {
+bool ReadPointCloudFromPCD(const std::string &filename,
+                           geometry::PointCloud &pointcloud,
+                           bool print_progress) {
+    PCDHeader header;
+    FILE *file = fopen(filename.c_str(), "rb");
+    if (file == NULL) {
+        utility::LogWarning("Read PCD failed: unable to open file: {}\n",
+                            filename);
+        return false;
+    }
+    if (ReadPCDHeader(file, header) == false) {
+        utility::LogWarning("Read PCD failed: unable to parse header.\n");
+        fclose(file);
+        return false;
+    }
+    utility::LogDebug(
+            "PCD header indicates {:d} fields, {:d} bytes per point, and {:d} "
+            "points "
+            "in total.\n",
+            (int)header.fields.size(), header.pointsize, header.points);
+    for (const auto &field : header.fields) {
+        utility::LogDebug("{}, {}, {:d}, {:d}, {:d}\n", field.name.c_str(),
+                          field.type, field.size, field.count, field.offset);
+    }
+    utility::LogDebug("Compression method is {:d}.\n", (int)header.datatype);
+    utility::LogDebug("Points: {};  normals: {};  colors: {}\n",
+                      header.has_points ? "yes" : "no",
+                      header.has_normals ? "yes" : "no",
+                      header.has_colors ? "yes" : "no");
+    if (ReadPCDData(file, header, pointcloud) == false) {
+        utility::LogWarning("Read PCD failed: unable to read data.\n");
+        fclose(file);
+        return false;
+    }
+    fclose(file);
+    return true;
+}
+
+bool WritePointCloudToPCD(const std::string &filename,
+                          const geometry::PointCloud &pointcloud,
+                          bool write_ascii /* = false*/,
+                          bool compressed /* = false*/,
+                          bool print_progress) {
+    PCDHeader header;
+    if (GenerateHeader(pointcloud, write_ascii, compressed, header) == false) {
+        utility::LogWarning("Write PCD failed: unable to generate header.\n");
+        return false;
+    }
+    FILE *file = fopen(filename.c_str(), "wb");
+    if (file == NULL) {
+        utility::LogWarning("Write PCD failed: unable to open file.\n");
+        return false;
+    }
+    if (WritePCDHeader(file, header) == false) {
+        utility::LogWarning("Write PCD failed: unable to write header.\n");
+        fclose(file);
+        return false;
+    }
+    if (WritePCDData(file, header, pointcloud) == false) {
+        utility::LogWarning("Write PCD failed: unable to write data.\n");
+        fclose(file);
+        return false;
+    }
+    fclose(file);
+    return true;
+}
+
+}  // namespace io
+}  // namespace cupoc
\ No newline at end of file
diff --git a/src/cupoc/utility/filesystem.cpp b/src/cupoc/utility/filesystem.cpp
new file mode 100644
index 00000000..caae3371
--- /dev/null
+++ b/src/cupoc/utility/filesystem.cpp
@@ -0,0 +1,25 @@
+#include "cupoc/utility/filesystem.h"
+
+#include <algorithm>
+
+namespace cupoc {
+namespace utility {
+namespace filesystem {
+
+std::string GetFileExtensionInLowerCase(const std::string &filename) {
+    size_t dot_pos = filename.find_last_of(".");
+    if (dot_pos >= filename.length()) return "";
+
+    if (filename.find_first_of("/\\", dot_pos) != std::string::npos) return "";
+
+    std::string filename_ext = filename.substr(dot_pos + 1);
+
+    std::transform(filename_ext.begin(), filename_ext.end(),
+                   filename_ext.begin(), ::tolower);
+
+    return filename_ext;
+}
+
+}
+}
+}
\ No newline at end of file
diff --git a/src/cupoc/utility/filesystem.h b/src/cupoc/utility/filesystem.h
new file mode 100644
index 00000000..336079e5
--- /dev/null
+++ b/src/cupoc/utility/filesystem.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <string>
+#include <vector>
+
+namespace cupoc {
+namespace utility {
+namespace filesystem {
+
+std::string GetFileExtensionInLowerCase(const std::string &filename);
+
+}
+}
+}
\ No newline at end of file
diff --git a/src/cupoc/utility/helper.h b/src/cupoc/utility/helper.h
index ca8f29b1..11ba0bcc 100644
--- a/src/cupoc/utility/helper.h
+++ b/src/cupoc/utility/helper.h
@@ -1,6 +1,10 @@
 #pragma once
 #include <Eigen/Core>
 #include <thrust/functional.h>
+#include <thrust/device_vector.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/permutation_iterator.h>
 
 namespace thrust {
 
@@ -15,6 +19,54 @@ struct equal_to<Eigen::Vector3i> {
     }
 };
 
+template <typename Iterator>
+class strided_range
+{
+    public:
+
+    typedef typename thrust::iterator_difference<Iterator>::type difference_type;
+
+    struct stride_functor : public thrust::unary_function<difference_type, difference_type>
+    {
+        difference_type stride;
+
+        stride_functor(difference_type stride)
+            : stride(stride) {}
+
+        __host__ __device__
+        difference_type operator()(const difference_type& i) const
+        { 
+            return stride * i;
+        }
+    };
+
+    typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
+    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
+    typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;
+
+    // type of the strided_range iterator
+    typedef PermutationIterator iterator;
+
+    // construct strided_range for the range [first,last)
+    strided_range(Iterator first, Iterator last, difference_type stride)
+        : first(first), last(last), stride(stride) {}
+   
+    iterator begin(void) const
+    {
+        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
+    }
+
+    iterator end(void) const
+    {
+        return begin() + ((last - first) + (stride - 1)) / stride;
+    }
+    
+    protected:
+    Iterator first;
+    Iterator last;
+    difference_type stride;
+};
+
 }
 
 namespace cupoc {
diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt
index 36fbbee9..ef64486c 100644
--- a/third_party/CMakeLists.txt
+++ b/third_party/CMakeLists.txt
@@ -31,16 +31,55 @@ if (BUILD_EIGEN3)
     set(EIGEN3_INCLUDE_DIRS "${PROJECT_SOURCE_DIR}/third_party/eigen")
 endif ()
 
-set(flann_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp)
+set(flann_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/flann/src/cpp)
+
+file(GLOB_RECURSE C_SOURCES flann/src/cpp/flann/flann.cpp)
+file(GLOB_RECURSE CPP_SOURCES flann/src/cpp/flann/flann_cpp.cpp)
+file(GLOB_RECURSE CU_SOURCES flann/src/cpp/flann/*.cu)
+
+add_library(flann_cpp_s STATIC ${CPP_SOURCES})
+target_include_directories(flann_cpp_s PUBLIC ${flann_INCLUDE_DIRS})
+target_link_libraries(flann_cpp_s ${LZ4_LIBRARIES})
+set_property(TARGET flann_cpp_s PROPERTY COMPILE_DEFINITIONS FLANN_STATIC FLANN_USE_CUDA)
+set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-DFLANN_USE_CUDA")
+cuda_add_library(flann_cuda_s STATIC ${CU_SOURCES})
+target_include_directories(flann_cuda_s PUBLIC ${flann_INCLUDE_DIRS})
+set_property(TARGET flann_cuda_s PROPERTY COMPILE_DEFINITIONS FLANN_STATIC)
+
+
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_COMPILER_IS_GNUCC)
+    add_library(flann_cpp SHARED "")
+    set_target_properties(flann_cpp PROPERTIES LINKER_LANGUAGE CXX)
+    target_link_libraries(flann_cpp -Wl,-whole-archive flann_cpp_s -Wl,-no-whole-archive)
+    cuda_add_library(flann_cuda SHARED "")
+    set_target_properties(flann_cuda PROPERTIES LINKER_LANGUAGE CXX)
+    target_link_libraries(flann_cuda -Wl,-whole-archive flann_cuda_s -Wl,-no-whole-archive)
+    set_property(TARGET flann_cpp_s PROPERTY COMPILE_DEFINITIONS FLANN_USE_CUDA)
+else()
+    add_library(flann_cpp SHARED ${CPP_SOURCES})
+    target_link_libraries(flann_cpp -Wl,--push-state,--no-as-needed ${LZ4_LIBRARIES} -Wl,--pop-state)
+    # export lz4 headers, so that MSVC to creates flann_cpp.lib
+    set_target_properties(flann_cpp PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS YES)
+    cuda_add_library(flann_cuda SHARED ${CPP_SOURCES})
+    set_property(TARGET flann_cpp PROPERTY COMPILE_DEFINITIONS FLANN_USE_CUDA)
+endif()
+
 
-include_directories(fmt/include)
 cuda_add_library(fmt fmt/src/format.cc fmt/src/posix.cc)
+target_include_directories(fmt PUBLIC fmt/include)
 set(fmt_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/fmt/include)
 
+# liblzf
+file(GLOB LIBLZF_SOURCE_FILES "liblzf/*.c")
+add_library(liblzf ${LIBLZF_SOURCE_FILES})
+target_include_directories(liblzf PUBLIC liblzf)
+set(liblzf_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/liblzf)
+
 list(APPEND 3RDPARTY_INCLUDE_DIRS
      ${EIGEN3_INCLUDE_DIRS}
      ${flann_INCLUDE_DIRS}
      ${fmt_INCLUDE_DIRS}
+     ${liblzf_INCLUDE_DIRS}
 )
 
 set(3RDPARTY_INCLUDE_DIRS ${3RDPARTY_INCLUDE_DIRS} PARENT_SCOPE)
\ No newline at end of file
diff --git a/third_party/liblzf/CMakeLists.txt b/third_party/liblzf/CMakeLists.txt
new file mode 100644
index 00000000..b4d5ccde
--- /dev/null
+++ b/third_party/liblzf/CMakeLists.txt
@@ -0,0 +1,6 @@
+# liblzf is only used in IO. It is thus included in IO.
+# If you want to use liblzf as an independent library, uncomment the following
+# lines.
+#
+#project(liblzf)
+#add_library(liblzf STATIC lzf_c.c lzf_d.c lzf.h)
diff --git a/third_party/liblzf/lzf.h b/third_party/liblzf/lzf.h
new file mode 100644
index 00000000..269ae746
--- /dev/null
+++ b/third_party/liblzf/lzf.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2000-2008 Marc Alexander Lehmann <schmorp@schmorp.de>
+ * 
+ * Redistribution and use in source and binary forms, with or without modifica-
+ * tion, are permitted provided that the following conditions are met:
+ * 
+ *   1.  Redistributions of source code must retain the above copyright notice,
+ *       this list of conditions and the following disclaimer.
+ * 
+ *   2.  Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
+ * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
+ * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * the GNU General Public License ("GPL") version 2 or any later version,
+ * in which case the provisions of the GPL are applicable instead of
+ * the above. If you wish to allow the use of your version of this file
+ * only under the terms of the GPL and not to allow others to use your
+ * version of this file under the BSD license, indicate your decision
+ * by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL. If you do not delete the
+ * provisions above, a recipient may use your version of this file under
+ * either the BSD or the GPL.
+ */
+
+#ifndef LZF_H
+#define LZF_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/***********************************************************************
+**
+**	lzf -- an extremely fast/free compression/decompression-method
+**	http://liblzf.plan9.de/
+**
+**	This algorithm is believed to be patent-free.
+**
+***********************************************************************/
+
+#define LZF_VERSION 0x0105 /* 1.5, API version */
+
+/*
+ * Compress in_len bytes stored at the memory block starting at
+ * in_data and write the result to out_data, up to a maximum length
+ * of out_len bytes.
+ *
+ * If the output buffer is not large enough or any error occurs return 0,
+ * otherwise return the number of bytes used, which might be considerably
+ * more than in_len (but less than 104% of the original size), so it
+ * makes sense to always use out_len == in_len - 1), to ensure _some_
+ * compression, and store the data uncompressed otherwise (with a flag, of
+ * course.
+ *
+ * lzf_compress might use different algorithms on different systems and
+ * even different runs, thus might result in different compressed strings
+ * depending on the phase of the moon or similar factors. However, all
+ * these strings are architecture-independent and will result in the
+ * original data when decompressed using lzf_decompress.
+ *
+ * The buffers must not be overlapping.
+ *
+ * If the option LZF_STATE_ARG is enabled, an extra argument must be
+ * supplied which is not reflected in this header file. Refer to lzfP.h
+ * and lzf_c.c.
+ *
+ */
+unsigned int 
+lzf_compress (const void *const in_data,  unsigned int in_len,
+              void             *out_data, unsigned int out_len);
+
+/*
+ * Decompress data compressed with some version of the lzf_compress
+ * function and stored at location in_data and length in_len. The result
+ * will be stored at out_data up to a maximum of out_len characters.
+ *
+ * If the output buffer is not large enough to hold the decompressed
+ * data, a 0 is returned and errno is set to E2BIG. Otherwise the number
+ * of decompressed bytes (i.e. the original length of the data) is
+ * returned.
+ *
+ * If an error in the compressed data is detected, a zero is returned and
+ * errno is set to EINVAL.
+ *
+ * This function is very fast, about as fast as a copying loop.
+ */
+unsigned int 
+lzf_decompress (const void *const in_data,  unsigned int in_len,
+                void             *out_data, unsigned int out_len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/third_party/liblzf/lzfP.h b/third_party/liblzf/lzfP.h
new file mode 100644
index 00000000..cf5678ae
--- /dev/null
+++ b/third_party/liblzf/lzfP.h
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2000-2007 Marc Alexander Lehmann <schmorp@schmorp.de>
+ *
+ * Redistribution and use in source and binary forms, with or without modifica-
+ * tion, are permitted provided that the following conditions are met:
+ *
+ *   1.  Redistributions of source code must retain the above copyright notice,
+ *       this list of conditions and the following disclaimer.
+ *
+ *   2.  Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
+ * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
+ * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * the GNU General Public License ("GPL") version 2 or any later version,
+ * in which case the provisions of the GPL are applicable instead of
+ * the above. If you wish to allow the use of your version of this file
+ * only under the terms of the GPL and not to allow others to use your
+ * version of this file under the BSD license, indicate your decision
+ * by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL. If you do not delete the
+ * provisions above, a recipient may use your version of this file under
+ * either the BSD or the GPL.
+ */
+
+#ifndef LZFP_h
+#define LZFP_h
+
+//#define STANDALONE 1 /* at the moment, this is ok. */
+
+#ifndef STANDALONE
+# include "lzf.h"
+#endif
+
+/*
+ * Size of hashtable is (1 << HLOG) * sizeof (char *)
+ * decompression is independent of the hash table size
+ * the difference between 15 and 14 is very small
+ * for small blocks (and 14 is usually a bit faster).
+ * For a low-memory/faster configuration, use HLOG == 13;
+ * For best compression, use 15 or 16 (or more, up to 22).
+ */
+#ifndef HLOG
+# define HLOG 16
+#endif
+
+/*
+ * Sacrifice very little compression quality in favour of compression speed.
+ * This gives almost the same compression as the default code, and is
+ * (very roughly) 15% faster. This is the preferred mode of operation.
+ */
+#ifndef VERY_FAST
+# define VERY_FAST 1
+#endif
+
+/*
+ * Sacrifice some more compression quality in favour of compression speed.
+ * (roughly 1-2% worse compression for large blocks and
+ * 9-10% for small, redundant, blocks and >>20% better speed in both cases)
+ * In short: when in need for speed, enable this for binary data,
+ * possibly disable this for text data.
+ */
+#ifndef ULTRA_FAST
+# define ULTRA_FAST 0
+#endif
+
+/*
+ * Unconditionally aligning does not cost very much, so do it if unsure
+ */
+#ifndef STRICT_ALIGN
+// # define STRICT_ALIGN !(defined(__i386) || defined (__amd64))
+#if !(defined(__i386) || defined (__amd64))
+# define STRICT_ALIGN 1
+#else
+# define STRICT_ALIGN 0
+#endif
+#endif
+
+/*
+ * You may choose to pre-set the hash table (might be faster on some
+ * modern cpus and large (>>64k) blocks, and also makes compression
+ * deterministic/repeatable when the configuration otherwise is the same).
+ */
+#ifndef INIT_HTAB
+# define INIT_HTAB 0
+#endif
+
+/*
+ * Avoid assigning values to errno variable? for some embedding purposes
+ * (linux kernel for example), this is necessary. NOTE: this breaks
+ * the documentation in lzf.h. Avoiding errno has no speed impact.
+ */
+#ifndef AVOID_ERRNO
+# define AVOID_ERRNO 0
+#endif
+
+/*
+ * Whether to pass the LZF_STATE variable as argument, or allocate it
+ * on the stack. For small-stack environments, define this to 1.
+ * NOTE: this breaks the prototype in lzf.h.
+ */
+#ifndef LZF_STATE_ARG
+# define LZF_STATE_ARG 0
+#endif
+
+/*
+ * Whether to add extra checks for input validity in lzf_decompress
+ * and return EINVAL if the input stream has been corrupted. This
+ * only shields against overflowing the input buffer and will not
+ * detect most corrupted streams.
+ * This check is not normally noticeable on modern hardware
+ * (<1% slowdown), but might slow down older cpus considerably.
+ */
+#ifndef CHECK_INPUT
+# define CHECK_INPUT 1
+#endif
+
+/*
+ * Whether to store pointers or offsets inside the hash table. On
+ * 64 bit architetcures, pointers take up twice as much space,
+ * and might also be slower. Default is to autodetect.
+ */
+/*#define LZF_USER_OFFSETS autodetect */
+
+/*****************************************************************************/
+/* nothing should be changed below */
+
+#ifdef __cplusplus
+# include <cstring>
+# include <climits>
+using namespace std;
+#else
+# include <string.h>
+# include <limits.h>
+#endif
+
+#ifndef LZF_USE_OFFSETS
+# if defined (_WIN32)
+#  define LZF_USE_OFFSETS defined(_M_X64)
+# else
+#  if __cplusplus > 199711L
+#   include <cstdint>
+#  else
+#   include <stdint.h>
+#  endif
+#  define LZF_USE_OFFSETS (UINTPTR_MAX > 0xffffffffU)
+# endif
+#endif
+
+typedef unsigned char u8;
+
+#if LZF_USE_OFFSETS
+# define LZF_HSLOT_BIAS ((const u8 *)in_data)
+  typedef unsigned int LZF_HSLOT;
+#else
+# define LZF_HSLOT_BIAS 0
+  typedef const u8 *LZF_HSLOT;
+#endif
+
+typedef LZF_HSLOT LZF_STATE[1 << (HLOG)];
+
+#if !STRICT_ALIGN
+/* for unaligned accesses we need a 16 bit datatype. */
+# if USHRT_MAX == 65535
+    typedef unsigned short u16;
+# elif UINT_MAX == 65535
+    typedef unsigned int u16;
+# else
+#  undef STRICT_ALIGN
+#  define STRICT_ALIGN 1
+# endif
+#endif
+
+#if ULTRA_FAST
+# undef VERY_FAST
+#endif
+
+#endif
+
diff --git a/third_party/liblzf/lzf_c.c b/third_party/liblzf/lzf_c.c
new file mode 100644
index 00000000..0c94400d
--- /dev/null
+++ b/third_party/liblzf/lzf_c.c
@@ -0,0 +1,299 @@
+/*
+ * Copyright (c) 2000-2010 Marc Alexander Lehmann <schmorp@schmorp.de>
+ * 
+ * Redistribution and use in source and binary forms, with or without modifica-
+ * tion, are permitted provided that the following conditions are met:
+ * 
+ *   1.  Redistributions of source code must retain the above copyright notice,
+ *       this list of conditions and the following disclaimer.
+ * 
+ *   2.  Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
+ * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
+ * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * the GNU General Public License ("GPL") version 2 or any later version,
+ * in which case the provisions of the GPL are applicable instead of
+ * the above. If you wish to allow the use of your version of this file
+ * only under the terms of the GPL and not to allow others to use your
+ * version of this file under the BSD license, indicate your decision
+ * by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL. If you do not delete the
+ * provisions above, a recipient may use your version of this file under
+ * either the BSD or the GPL.
+ */
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4244)
+#endif
+
+#include "lzfP.h"
+
+#define HSIZE (1 << (HLOG))
+
+/*
+ * don't play with this unless you benchmark!
+ * the data format is not dependent on the hash function.
+ * the hash function might seem strange, just believe me,
+ * it works ;)
+ */
+#ifndef FRST
+# define FRST(p) (((p[0]) << 8) | p[1])
+# define NEXT(v,p) (((v) << 8) | p[2])
+# if ULTRA_FAST
+#  define IDX(h) ((( h             >> (3*8 - HLOG)) - h  ) & (HSIZE - 1))
+# elif VERY_FAST
+#  define IDX(h) ((( h             >> (3*8 - HLOG)) - h*5) & (HSIZE - 1))
+# else
+#  define IDX(h) ((((h ^ (h << 5)) >> (3*8 - HLOG)) - h*5) & (HSIZE - 1))
+# endif
+#endif
+/*
+ * IDX works because it is very similar to a multiplicative hash, e.g.
+ * ((h * 57321 >> (3*8 - HLOG)) & (HSIZE - 1))
+ * the latter is also quite fast on newer CPUs, and compresses similarly.
+ *
+ * the next one is also quite good, albeit slow ;)
+ * (int)(cos(h & 0xffffff) * 1e6)
+ */
+
+#if 0
+/* original lzv-like hash function, much worse and thus slower */
+# define FRST(p) (p[0] << 5) ^ p[1]
+# define NEXT(v,p) ((v) << 5) ^ p[2]
+# define IDX(h) ((h) & (HSIZE - 1))
+#endif
+
+#define        MAX_LIT        (1 <<  5)
+#define        MAX_OFF        (1 << 13)
+#define        MAX_REF        ((1 << 8) + (1 << 3))
+
+#if __GNUC__ >= 3
+# define expect(expr,value)         __builtin_expect ((expr),(value))
+# define inline                     inline
+#else
+# define expect(expr,value)         (expr)
+# define inline                     static
+#endif
+
+#define expect_false(expr) expect ((expr) != 0, 0)
+#define expect_true(expr)  expect ((expr) != 0, 1)
+
+/*
+ * compressed format
+ *
+ * 000LLLLL <L+1>    ; literal, L+1=1..33 octets
+ * LLLooooo oooooooo ; backref L+1=1..7 octets, o+1=1..4096 offset
+ * 111ooooo LLLLLLLL oooooooo ; backref L+8 octets, o+1=1..4096 offset
+ *
+ */
+
+unsigned int
+lzf_compress (const void *const in_data, unsigned int in_len,
+	      void *out_data, unsigned int out_len
+#if LZF_STATE_ARG
+              , LZF_STATE htab
+#endif
+              )
+{
+#if !LZF_STATE_ARG
+  LZF_STATE htab;
+#endif
+  const u8 *ip = (const u8 *)in_data;
+        u8 *op = (u8 *)out_data;
+  const u8 *in_end  = ip + in_len;
+        u8 *out_end = op + out_len;
+  const u8 *ref;
+
+  /* off requires a type wide enough to hold a general pointer difference.
+   * ISO C doesn't have that (size_t might not be enough and ptrdiff_t only
+   * works for differences within a single object). We also assume that no
+   * no bit pattern traps. Since the only platform that is both non-POSIX
+   * and fails to support both assumptions is windows 64 bit, we make a
+   * special workaround for it.
+   */
+#if defined (_WIN32) && defined (_M_X64)
+  unsigned _int64 off; /* workaround for missing POSIX compliance */
+#else
+  unsigned long off;
+#endif
+  unsigned int hval;
+  int lit;
+
+  if (!in_len || !out_len)
+    return 0;
+
+#if INIT_HTAB
+  memset (htab, 0, sizeof (htab));
+#endif
+
+  lit = 0; op++; /* start run */
+
+  hval = FRST (ip);
+  while (ip < in_end - 2)
+    {
+      LZF_HSLOT *hslot;
+
+      hval = NEXT (hval, ip);
+      hslot = htab + IDX (hval);
+      ref = *hslot + LZF_HSLOT_BIAS; *hslot = ip - LZF_HSLOT_BIAS;
+
+      if (1
+#if INIT_HTAB
+          && ref < ip /* the next test will actually take care of this, but this is faster */
+#endif
+          && (off = ip - ref - 1) < MAX_OFF
+          && ref > (u8 *)in_data
+          && ref[2] == ip[2]
+#if STRICT_ALIGN
+          && ((ref[1] << 8) | ref[0]) == ((ip[1] << 8) | ip[0])
+#else
+          && *(u16 *)ref == *(u16 *)ip
+#endif
+        )
+        {
+          /* match found at *ref++ */
+          unsigned int len = 2;
+          unsigned int maxlen = in_end - ip - len;
+          maxlen = maxlen > MAX_REF ? MAX_REF : maxlen;
+
+          if (expect_false (op + 3 + 1 >= out_end)) /* first a faster conservative test */
+            if (op - !lit + 3 + 1 >= out_end) /* second the exact but rare test */
+              return 0;
+
+          op [- lit - 1] = lit - 1; /* stop run */
+          op -= !lit; /* undo run if length is zero */
+
+          for (;;)
+            {
+              if (expect_true (maxlen > 16))
+                {
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+                }
+
+              do
+                len++;
+              while (len < maxlen && ref[len] == ip[len]);
+
+              break;
+            }
+
+          len -= 2; /* len is now #octets - 1 */
+          ip++;
+
+          if (len < 7)
+            {
+              *op++ = (off >> 8) + (len << 5);
+            }
+          else
+            {
+              *op++ = (off >> 8) + (  7 << 5);
+              *op++ = len - 7;
+            }
+
+          *op++ = off;
+
+          lit = 0; op++; /* start run */
+
+          ip += len + 1;
+
+          if (expect_false (ip >= in_end - 2))
+            break;
+
+#if ULTRA_FAST || VERY_FAST
+          --ip;
+# if VERY_FAST && !ULTRA_FAST
+          --ip;
+# endif
+          hval = FRST (ip);
+
+          hval = NEXT (hval, ip);
+          htab[IDX (hval)] = ip - LZF_HSLOT_BIAS;
+          ip++;
+
+# if VERY_FAST && !ULTRA_FAST
+          hval = NEXT (hval, ip);
+          htab[IDX (hval)] = ip - LZF_HSLOT_BIAS;
+          ip++;
+# endif
+#else
+          ip -= len + 1;
+
+          do
+            {
+              hval = NEXT (hval, ip);
+              htab[IDX (hval)] = ip - LZF_HSLOT_BIAS;
+              ip++;
+            }
+          while (len--);
+#endif
+        }
+      else
+        {
+          /* one more literal byte we must copy */
+          if (expect_false (op >= out_end))
+            return 0;
+
+          lit++; *op++ = *ip++;
+
+          if (expect_false (lit == MAX_LIT))
+            {
+              op [- lit - 1] = lit - 1; /* stop run */
+              lit = 0; op++; /* start run */
+            }
+        }
+    }
+
+  if (op + 3 > out_end) /* at most 3 bytes can be missing here */
+    return 0;
+
+  while (ip < in_end)
+    {
+      lit++; *op++ = *ip++;
+
+      if (expect_false (lit == MAX_LIT))
+        {
+          op [- lit - 1] = lit - 1; /* stop run */
+          lit = 0; op++; /* start run */
+        }
+    }
+
+  op [- lit - 1] = lit - 1; /* end run */
+  op -= !lit; /* undo run if length is zero */
+
+  return op - (u8 *)out_data;
+}
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
diff --git a/third_party/liblzf/lzf_d.c b/third_party/liblzf/lzf_d.c
new file mode 100644
index 00000000..cc61a17f
--- /dev/null
+++ b/third_party/liblzf/lzf_d.c
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2000-2010 Marc Alexander Lehmann <schmorp@schmorp.de>
+ * 
+ * Redistribution and use in source and binary forms, with or without modifica-
+ * tion, are permitted provided that the following conditions are met:
+ * 
+ *   1.  Redistributions of source code must retain the above copyright notice,
+ *       this list of conditions and the following disclaimer.
+ * 
+ *   2.  Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
+ * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
+ * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * the GNU General Public License ("GPL") version 2 or any later version,
+ * in which case the provisions of the GPL are applicable instead of
+ * the above. If you wish to allow the use of your version of this file
+ * only under the terms of the GPL and not to allow others to use your
+ * version of this file under the BSD license, indicate your decision
+ * by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL. If you do not delete the
+ * provisions above, a recipient may use your version of this file under
+ * either the BSD or the GPL.
+ */
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4244)
+#endif
+
+#include "lzfP.h"
+
+#if AVOID_ERRNO
+# define SET_ERRNO(n)
+#else
+# include <errno.h>
+# define SET_ERRNO(n) errno = (n)
+#endif
+
+#if USE_REP_MOVSB /* small win on amd, big loss on intel */
+#if (__i386 || __amd64) && __GNUC__ >= 3
+# define lzf_movsb(dst, src, len)                \
+   asm ("rep movsb"                              \
+        : "=D" (dst), "=S" (src), "=c" (len)     \
+        :  "0" (dst),  "1" (src),  "2" (len));
+#endif
+#endif
+
+unsigned int 
+lzf_decompress (const void *const in_data,  unsigned int in_len,
+                void             *out_data, unsigned int out_len)
+{
+  u8 const *ip = (const u8 *)in_data;
+  u8       *op = (u8 *)out_data;
+  u8 const *const in_end  = ip + in_len;
+  u8       *const out_end = op + out_len;
+
+  do
+    {
+      unsigned int ctrl = *ip++;
+
+      if (ctrl < (1 << 5)) /* literal run */
+        {
+          ctrl++;
+
+          if (op + ctrl > out_end)
+            {
+              SET_ERRNO (E2BIG);
+              return 0;
+            }
+
+#if CHECK_INPUT
+          if (ip + ctrl > in_end)
+            {
+              SET_ERRNO (EINVAL);
+              return 0;
+            }
+#endif
+
+#ifdef lzf_movsb
+          lzf_movsb (op, ip, ctrl);
+#else
+          switch (ctrl)
+            {
+              case 32: *op++ = *ip++; case 31: *op++ = *ip++; case 30: *op++ = *ip++; case 29: *op++ = *ip++;
+              case 28: *op++ = *ip++; case 27: *op++ = *ip++; case 26: *op++ = *ip++; case 25: *op++ = *ip++;
+              case 24: *op++ = *ip++; case 23: *op++ = *ip++; case 22: *op++ = *ip++; case 21: *op++ = *ip++;
+              case 20: *op++ = *ip++; case 19: *op++ = *ip++; case 18: *op++ = *ip++; case 17: *op++ = *ip++;
+              case 16: *op++ = *ip++; case 15: *op++ = *ip++; case 14: *op++ = *ip++; case 13: *op++ = *ip++;
+              case 12: *op++ = *ip++; case 11: *op++ = *ip++; case 10: *op++ = *ip++; case  9: *op++ = *ip++;
+              case  8: *op++ = *ip++; case  7: *op++ = *ip++; case  6: *op++ = *ip++; case  5: *op++ = *ip++;
+              case  4: *op++ = *ip++; case  3: *op++ = *ip++; case  2: *op++ = *ip++; case  1: *op++ = *ip++;
+            }
+#endif
+        }
+      else /* back reference */
+        {
+          unsigned int len = ctrl >> 5;
+
+          u8 *ref = op - ((ctrl & 0x1f) << 8) - 1;
+
+#if CHECK_INPUT
+          if (ip >= in_end)
+            {
+              SET_ERRNO (EINVAL);
+              return 0;
+            }
+#endif
+          if (len == 7)
+            {
+              len += *ip++;
+#if CHECK_INPUT
+              if (ip >= in_end)
+                {
+                  SET_ERRNO (EINVAL);
+                  return 0;
+                }
+#endif
+            }
+
+          ref -= *ip++;
+
+          if (op + len + 2 > out_end)
+            {
+              SET_ERRNO (E2BIG);
+              return 0;
+            }
+
+          if (ref < (u8 *)out_data)
+            {
+              SET_ERRNO (EINVAL);
+              return 0;
+            }
+
+#ifdef lzf_movsb
+          len += 2;
+          lzf_movsb (op, ref, len);
+#else
+          switch (len)
+            {
+              default:
+                len += 2;
+
+                if (op >= ref + len)
+                  {
+                    /* disjunct areas */
+                    memcpy (op, ref, len);
+                    op += len;
+                  }
+                else
+                  {
+                    /* overlapping, use octte by octte copying */
+                    do
+                      *op++ = *ref++;
+                    while (--len);
+                  }
+
+                break;
+
+              case 9: *op++ = *ref++;
+              case 8: *op++ = *ref++;
+              case 7: *op++ = *ref++;
+              case 6: *op++ = *ref++;
+              case 5: *op++ = *ref++;
+              case 4: *op++ = *ref++;
+              case 3: *op++ = *ref++;
+              case 2: *op++ = *ref++;
+              case 1: *op++ = *ref++;
+              case 0: *op++ = *ref++; /* two octets more */
+                      *op++ = *ref++;
+            }
+#endif
+        }
+    }
+  while (ip < in_end);
+
+  return op - (u8 *)out_data;
+}
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif