diff --git a/CMakeLists.txt b/CMakeLists.txt
deleted file mode 100644
index 4fe64de..0000000
--- a/CMakeLists.txt
+++ /dev/null
@@ -1,78 +0,0 @@
-cmake_minimum_required(VERSION 2.6)
-
-project(cudaSift)
-set(cudaSift_VERSION_MAJOR 2)
-set(cudaSift_VERSION_MINOR 0)
-set(cudaSift_VERSION_PATCH 0)
-
-set(CPACK_PACKAGE_VERSION_MAJOR "${cudaSift_VERSION_MAJOR}")
-set(CPACK_PACKAGE_VERSION_MINOR "${cudaSift_VERSION_MINOR}")
-set(CPACK_PACKAGE_VERSION_PATCH "${cudaSift_VERSION_PATCH}")
-set(CPACK_GENERATOR "ZIP")
-include(CPack)
-
-find_package(OpenCV REQUIRED)
-find_package(CUDA)
-if (NOT CUDA_FOUND)
-  message(STATUS "CUDA not found. Project will not be built.")
-endif(NOT CUDA_FOUND)
-
-if (WIN32)
-  set(EXTRA_CXX_FLAGS "/DVERBOSE /D_CRT_SECURE_NO_WARNINGS ")
-  list(APPEND CUDA_NVCC_FLAGS "-arch=sm_35;--compiler-options;-O2;-DVERBOSE") 
-endif()
-if (UNIX)
-  if (APPLE)
-    set(EXTRA_CXX_FLAGS "-DVERBOSE -msse2")
-    list(APPEND CUDA_NVCC_FLAGS "-arch=sm_35;--compiler-options;-O2;-DVERBOSE") 
-  else()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -msse2 ")
-    list(APPEND CUDA_NVCC_FLAGS "-lineinfo;-ccbin;/usr/bin/gcc-6;--compiler-options;-O2;-D_FORCE_INLINES;-DVERBOSE_NOT") 
-  endif()
-endif()
-
-set(cuda_sources
-  cudaImage.cu  
-  cudaImage.h  
-  cudaSiftH.cu 
-  cudaSiftH.h  
-  matching.cu  
-  cudaSiftD.h  
-  cudaSift.h  
-  cudautils.h
-)  
-
-set(sources
-  geomFuncs.cpp  
-  mainSift.cpp  
-)
-
-include_directories(
-  ${CMAKE_CURRENT_SOURCE_DIR} 
-)
-
-#SET(CUDA_SEPARABLE_COMPILATION ON)
-
-cuda_add_executable(cudasift ${cuda_sources} ${sources} OPTIONS -arch=sm_35)
-
-#cuda_add_executable(l2net l2netD.cu OPTIONS -arch=sm_35)
-
-set_target_properties(cudasift PROPERTIES
-  COMPILE_FLAGS "${EXTRA_CXX_FLAGS}"			   
-)
-
-target_link_libraries(cudasift ${CUDA_cudadevrt_LIBRARY} ${OpenCV_LIBS})
-#  /usr/local/cuda/lib64/libcudadevrt.a ${OpenCV_LIBS} 
-#)
- 
-install(FILES 
-  ${cuda_sources} 
-  ${sources}
-  cudaSiftD.cu
-  CMakeLists.txt
-  Copyright.txt
-  DESTINATION .
-)
-install(FILES data/left.pgm data/righ.pgm
-  DESTINATION data
-)
diff --git a/LICENSE b/LICENSE
index 7d7541e..bee8393 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,21 +1,21 @@
-MIT License
-
-Copyright (c) 2017 Mårten Björkman
+Modifications Copyright (C) 2023 Intel Corporation
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
+of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
 
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
+OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+OR OTHER DEALINGS IN THE SOFTWARE.
 
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+SPDX-License-Identifier: MIT
\ No newline at end of file
diff --git a/README.md b/README.md
old mode 100644
new mode 100755
index 9cee717..a520e54
--- a/README.md
+++ b/README.md
@@ -1,134 +1,75 @@
-# CudaSift - SIFT features with CUDA
+# SyclSift
+SyclSift - SIFT features with SYCL.
 
-This is the fourth version of a SIFT (Scale Invariant Feature Transform) implementation using CUDA for GPUs from NVidia. The first version is from 2007 and GPUs have evolved since then. This version is slightly more precise and considerably faster than the previous versions and has been optimized for Kepler and later generations of GPUs.
+# Building SyclSift
+**To build the CUDA version**
 
-On a GTX 1060 GPU the code takes about 1.2 ms on a 1280x960 pixel image and 1.7 ms on a 1920x1080 pixel image. There is also code for brute-force matching of features that takes about 2.2 ms for two sets of around 1900 SIFT features each.
+mkdir build && cd build
 
-The code relies on CMake for compilation and OpenCV for image containers. OpenCV can however be quite easily changed to something else. The code can be relatively hard to read, given the way things have been parallelized for maximum speed.
+For an A100 machine:
 
-The code is free to use for non-commercial applications. If you use the code for research, please cite to the following paper.
+cmake ../ -DUSE_SM=80
 
-M. Bj&ouml;rkman, N. Bergstr&ouml;m and D. Kragic, "Detecting, segmenting and tracking unknown objects using multi-label MRF inference", CVIU, 118, pp. 111-127, January 2014. [ScienceDirect](http://www.sciencedirect.com/science/article/pii/S107731421300194X)
+For an H100 machine:
 
-## Update in feature matching (2019-05-17)
+cmake ../ -DUSE_SM=90
 
-The brute force feature matcher has been significantly improved in speed. The largest improvements can be seen for large feature sets with 10000 features or more, but as can be seen below, it performs rather well even with just 2000 features. The file [match.pdf](https://github.com/Celebrandil/CudaSift/blob/Pascal/match.pdf) includes a description of the optimizations done in this version.
+make
 
-## New version for Pascal (2018-10-26)
+**To build SYCL version**
 
-There is a new version optimized for Pascal cards, but it should work also on many older cards. Since it includes some bug fixes that changes slightly how features are extracted, which might affect matching to features extracted using an older version, the changes are kept in a new branch (Pascal). The fixes include a small change in ScaleDown that corrects an odd behaviour for images with heights not divisible by 2^(#octaves). The second change is a correction of an improper shift of (0.5,0.5) pixels, when pixel values were read from the image to create a descriptor. 
+mkdir build
 
-Then there are some improvements in terms of speed, especially in the Laplace function, that detects DoG features, and the LowPass function, that is seen as preprocessing and is not included in the benchmarking below. Maybe surprisingly, even if optimizations were done with respect to Pascal cards, these improvements were even better for older cards. The changes involve trying to make each CUDA thread have more work to do, using fewer thread blocks. For typical images of today, there will be enough blocks to feed the streaming multiprocessors anyway.
+cd build
 
-Latest result of version under test:
+Update the path for OpenCV_DIR before configuring:
 
-|         |                     | 1280x960 | 1920x1080 |  GFLOPS  | Bandwidth | Matching |
-| ------- | ------------------- | -------| ---------| ---------- | --------|--------|
-| Turing  | GeForce RTX 2080 Ti |   0.42* |     0.56* |	11750    |  616    |   0.30* |
-| Pascal  | GeForce GTX 1080 Ti |   0.58* |     0.80* |	10609    |  484    |   0.42* |
-| Pascal  | GeForce GTX 1060    |   1.2 |     1.7 |	3855    |  192    |   2.2 |
-| Maxwell | GeForce GTX 970     |   1.3 |     1.8 |    3494    |  224    |   2.5 |
-| Kepler  | Tesla K40c          |   2.4 |     3.4 |    4291    |  288    |   4.7 |
+CXX=icpx cmake ../ -DGPU_AOT=pvc
 
-Matching is done between two sets of 1911 and 2086 features respectively. A star indicates results from the last checked in version.
+make -sj
 
-## Benchmarking of new version (2018-08-22)
+**To build SYCL version on NVIDIA Backend**
 
-About every 2nd year, I try to update the code to gain even more speed through further optimization. Here are some results for a new version of the code. Improvements in speed have primarilly been gained by reducing communication between host and device, better balancing the load on caches, shared and global memory, and increasing the workload of each thread block.
+source /path/to/clang/
 
-|         |                     | 1280x960 | 1920x1080 |  GFLOPS  | Bandwidth | Matching |
-| ------- | ------------------- | -------| ---------| ---------- | --------|--------|
-| Pascal  | GeForce GTX 1080 Ti |   0.7  |     1.0  |	10609    |  484    |   1.0 |
-| Pascal  | GeForce GTX 1060    |   1.6  |     2.4  |	3855    |  192    |   2.2 |
-| Maxwell | GeForce GTX 970     |   1.9  |     2.8  |    3494    |  224    |   2.5 |
-| Kepler  | Tesla K40c          |   3.1  |     4.7  |    4291    |  288    |   4.7 |
-| Kepler  | GeForce GTX TITAN   |   2.9  |     4.3  |    4500    |  288    |   4.5 |
+mkdir build && cd build
 
-Matching is done between two sets of 1818 and 1978 features respectively. 
+For an A100 machine:
 
-It's questionable whether further optimization really makes sense, given that the cost of just transfering an 1920x1080 pixel image to the device takes about 1.4 ms on a GTX 1080 Ti. Even if the brute force feature matcher is not much faster than earlier versions, it does not have the same O(N^2) temporary memory overhead, which is preferable if there are many features.
+CC=clang CXX=clang++ cmake ../ -DUSE_NVIDIA_BACKEND=YES -DUSE_SM=80 
 
-## Benchmarking of previous version (2017-05-24)
+For an H100 machine:
 
-Computational cost (in milliseconds) on different GPUs:
+CC=clang CXX=clang++ cmake ../ -DUSE_NVIDIA_BACKEND=YES -DUSE_SM=90
 
-|         |                     | 1280x960 | 1920x1080 |  GFLOPS  | Bandwidth | Matching |
-| ------- | ------------------- | -------| ---------| ---------- | --------|--------|
-| Pascal  | GeForce GTX 1080 Ti |   1.7  |     2.3  |	10609    |  484    |   1.4 |
-| Pascal  | GeForce GTX 1060    |   2.7  |     4.0  |	 3855    |  192    |   2.6 |
-| Maxwell | GeForce GTX 970     |   3.8  |     5.6  |    3494    |  224    |   2.8 |
-| Kepler  | Tesla K40c          |   5.4  |     8.0  |    4291    |  288    |   5.5 |
-| Kepler  | GeForce GTX TITAN   |   4.4  |     6.6  |    4500    |  288    |   4.6 |
+make -sj
 
-Matching is done between two sets of 1616 and 1769 features respectively. 
- 
-The improvements in this version involved a slight adaptation for Pascal, changing from textures to global memory (mostly through L2) in the most costly function LaplaceMulti. The medium-end card GTX 1060 is impressive indeed. 
+**To build SYCL version on AMD Backend**
 
-## Usage
+source /path/to/clang/
 
-There are two different containers for storing data on the host and on the device; *SiftData* for SIFT features and *CudaImage* for images. Since memory allocation on GPUs is slow, it's usually preferable to preallocate a sufficient amount of memory using *InitSiftData()*, in particular if SIFT features are extracted from a continuous stream of video camera images. On repeated calls *ExtractSift()* will reuse memory previously allocated.
-~~~c
-#include <opencv2/core/core.hpp>
-#include <opencv2/highgui/highgui.hpp>
-#include <cudaImage.h>
-#include <cudaSift.h>
+mkdir build && cd build
 
-/* Reserve memory space for a whole bunch of SIFT features. */
-SiftData siftData;
-InitSiftData(siftData, 25000, true, true);
+For an MI-100 machine:
 
-/* Read image using OpenCV and convert to floating point. */
-cv::Mat limg;
-cv::imread("image.png", 0).convertTo(limg, CV32FC1);
-/* Allocate 1280x960 pixel image with device side pitch of 1280 floats. */ 
-/* Memory on host side already allocated by OpenCV is reused.           */
-CudaImage img;
-img.Allocate(1280, 960, 1280, false, NULL, (float*) limg.data);
-/* Download image from host to device */
-img.Download();
+CC=clang CXX=clang++ cmake ../ -DUSE_AMDHIP_BACKEND=gfx908
 
-int numOctaves = 5;    /* Number of octaves in Gaussian pyramid */
-float initBlur = 1.0f; /* Amount of initial Gaussian blurring in standard deviations */
-float thresh = 3.5f;   /* Threshold on difference of Gaussians for feature pruning */
-float minScale = 0.0f; /* Minimum acceptable scale to remove fine-scale features */
-bool upScale = false;  /* Whether to upscale image before extraction */
-/* Extract SIFT features */
-ExtractSift(siftData, img, numOctaves, initBlur, thresh, minScale, upScale);
-...
-/* Free space allocated from SIFT features */
-FreeSiftData(siftData);
+For an MI-250 machine:
 
-~~~
+CC=clang CXX=clang++ cmake ../ -DUSE_AMDHIP_BACKEND=gfx90a
 
-## Parameter setting
+make -sj
 
-The requirements on number and quality of features vary from application to application. Some applications benefit from a smaller number of high quality features, while others require as many features as possible. More distinct features with higher DoG (difference of Gaussians) responses tend to be of higher quality and are easier to match between multiple views. With the parameter *thresh* a threshold can be set on the minimum DoG to prune features of less quality. 
+# Running SyclSift
 
-In many cases the most fine-scale features are of little use, especially when noise conditions are severe or when features are matched between very different views. In such cases the most fine-scale features can be pruned by setting *minScale* to the minimum acceptable feature scale, where 1.0 corresponds to the original image scale without upscaling. As a consequence of pruning the computational cost can also be reduced.
+**To run the SYCL version**
 
-To increase the number of SIFT features, but also increase the computational cost, the original image can be automatically upscaled to double the size using the *upScale* parameter, in accordance to Lowe's recommendations. One should keep in mind though that by doing so the fraction of features that can be matched tend to go down, even if the total number of extracted features increases significantly. If it's enough to instead reduce the *thresh* parameter to get more features, that is often a better alternative.
+./syclsift
 
-Results without upscaling (upScale=False) of 1280x960 pixel input image. 
+**To run SYCL on NVIDIA Backend**
 
-| *thresh* | #Matches | %Matches | Cost (ms) |
-|-----------|----------|----------|-----------|
-|    1.0    |   4236   |   40.4%  |    5.8    |
-|    1.5    |   3491   |   42.5%  |    5.2    |
-|    2.0    |   2720   |   43.2%  |    4.7    |
-|    2.5    |   2121   |   44.4%  |    4.2    |
-|    3.0    |   1627   |   45.8%  |    3.9    |
-|    3.5    |   1189   |   46.2%  |    3.6    |
-|    4.0    |    881   |   48.5%  |    3.3    |
-
-
-Results with upscaling (upScale=True) of 1280x960 pixel input image.
-
-| *thresh* | #Matches | %Matches | Cost (ms) |
-|-----------|----------|----------|-----------|
-|    2.0    |   4502   |   34.9%  |   13.2    |
-|    2.5    |   3389   |   35.9%  |   11.2    |
-|    3.0    |   2529   |   37.1%  |   10.6    |
-|    3.5    |   1841   |   38.3%  |    9.9    |
-|    4.0    |   1331   |   39.8%  |    9.5    |
-|    4.5    |    954   |   42.2%  |    9.3    |
-|    5.0    |    611   |   39.3%  |    9.1    |
+./syclsift
+
+**To run SYCL on AMD Backend**
+
+ONEAPI_DEVICE_SELECTOR=hip:* ./syclsift
diff --git a/common/Utility.cpp b/common/Utility.cpp
new file mode 100644
index 0000000..b254877
--- /dev/null
+++ b/common/Utility.cpp
@@ -0,0 +1,77 @@
+// Copyright (C) 2023 Intel Corporation
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom
+// the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
+// OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+// OR OTHER DEALINGS IN THE SOFTWARE.
+
+// SPDX-License-Identifier: MIT
+
+#include <cstdio>
+#include <iostream>
+#include "Utility.h"
+
+using namespace Utility;
+
+// Reports whether a measured match percentage is plausible for a threshold.
+//
+// Always prints a "Performing data verification" banner first. For a threshold
+// of 1 through 4, matchPercentage is then compared against that threshold's
+// window (both bounds exclusive) and a SUCCESSFUL or FAILED line is printed.
+// Any other threshold only produces a message stating the accepted range.
+//
+// threshold:       selects the expected window; supported values are 1..4.
+// matchPercentage: measured percentage to check against that window.
+//
+// Nothing is returned; the outcome is only written to standard output.
+void Utility::RunDataVerification(const int threshold, const float matchPercentage)
+{
+    // Exclusive {lower, upper} limits, stored at index (threshold - 1).
+    static const float kWindows[4][2] = {
+        {20.0f, 30.0f}, // threshold 1
+        {26.0f, 38.0f}, // threshold 2
+        {35.0f, 45.0f}, // threshold 3
+        {40.0f, 50.0f}, // threshold 4
+    };
+    const int firstThreshold = 1;
+    const int lastThreshold = 4;
+
+    printf("Performing data verification \n");
+
+    // No window exists outside [1, 4]: report the accepted range and stop.
+    if (threshold < firstThreshold || threshold > lastThreshold)
+    {
+        printf("Threshold values should be in the range [1, 4]. \n\n");
+        return;
+    }
+
+    const float *window = kWindows[threshold - firstThreshold];
+    const float lowerLimit = window[0];
+    const float upperLimit = window[1];
+
+    // Strict comparisons on both sides, so a value exactly equal to either
+    // limit is reported as FAILED.
+    const bool insideWindow = matchPercentage > lowerLimit && matchPercentage < upperLimit;
+
+    if (insideWindow)
+    {
+        printf("Data verification is SUCCESSFUL. \n\n");
+    }
+    else
+    {
+        printf("Data verification FAILED. \n\n");
+    }
+}
diff --git a/common/Utility.h b/common/Utility.h
new file mode 100644
index 0000000..0a6cff3
--- /dev/null
+++ b/common/Utility.h
@@ -0,0 +1,31 @@
+// Copyright (C) 2023 Intel Corporation
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom
+// the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
+// OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+// OR OTHER DEALINGS IN THE SOFTWARE.
+
+// SPDX-License-Identifier: MIT
+
+#ifndef UTILITY_H
+#define UTILITY_H
+
+namespace Utility
+{
+    // Prints whether matchPercentage falls inside the window expected for threshold (1..4).
+    void RunDataVerification(const int threshold, const float matchPercentage);
+}
+#endif // UTILITY_H
diff --git a/cudaImage.cu b/cudaImage.cu
deleted file mode 100644
index db9182e..0000000
--- a/cudaImage.cu
+++ /dev/null
@@ -1,115 +0,0 @@
-//********************************************************//
-// CUDA SIFT extractor by Marten Bjorkman aka Celebrandil //
-//********************************************************//  
-
-#include <cstdio>
-
-#include "cudautils.h"
-#include "cudaImage.h"
-
-int iDivUp(int a, int b) { return (a%b != 0) ? (a/b + 1) : (a/b); }
-int iDivDown(int a, int b) { return a/b; }
-int iAlignUp(int a, int b) { return (a%b != 0) ?  (a - a%b + b) : a; }
-int iAlignDown(int a, int b) { return a - a%b; }
-
-void CudaImage::Allocate(int w, int h, int p, bool host, float *devmem, float *hostmem) 
-{
-  width = w;
-  height = h; 
-  pitch = p; 
-  d_data = devmem;
-  h_data = hostmem; 
-  t_data = NULL; 
-  if (devmem==NULL) {
-    safeCall(cudaMallocPitch((void **)&d_data, (size_t*)&pitch, (size_t)(sizeof(float)*width), (size_t)height));
-    pitch /= sizeof(float);
-    if (d_data==NULL) 
-      printf("Failed to allocate device data\n");
-    d_internalAlloc = true;
-  }
-  if (host && hostmem==NULL) {
-    h_data = (float *)malloc(sizeof(float)*pitch*height);
-    h_internalAlloc = true;
-  }
-}
-
-CudaImage::CudaImage() : 
-  width(0), height(0), d_data(NULL), h_data(NULL), t_data(NULL), d_internalAlloc(false), h_internalAlloc(false)
-{
-
-}
-
-CudaImage::~CudaImage()
-{
-  if (d_internalAlloc && d_data!=NULL) 
-    safeCall(cudaFree(d_data));
-  d_data = NULL;
-  if (h_internalAlloc && h_data!=NULL) 
-    free(h_data);
-  h_data = NULL;
-  if (t_data!=NULL) 
-    safeCall(cudaFreeArray((cudaArray *)t_data));
-  t_data = NULL;
-}
-  
-double CudaImage::Download()  
-{
-  TimerGPU timer(0);
-  int p = sizeof(float)*pitch;
-  if (d_data!=NULL && h_data!=NULL) 
-    safeCall(cudaMemcpy2D(d_data, p, h_data, sizeof(float)*width, sizeof(float)*width, height, cudaMemcpyHostToDevice));
-  double gpuTime = timer.read();
-#ifdef VERBOSE
-  printf("Download time =               %.2f ms\n", gpuTime);
-#endif
-  return gpuTime;
-}
-
-double CudaImage::Readback()
-{
-  TimerGPU timer(0);
-  int p = sizeof(float)*pitch;
-  safeCall(cudaMemcpy2D(h_data, sizeof(float)*width, d_data, p, sizeof(float)*width, height, cudaMemcpyDeviceToHost));
-  double gpuTime = timer.read();
-#ifdef VERBOSE
-  printf("Readback time =               %.2f ms\n", gpuTime);
-#endif
-  return gpuTime;
-}
-
-double CudaImage::InitTexture()
-{
-  TimerGPU timer(0);
-  cudaChannelFormatDesc t_desc = cudaCreateChannelDesc<float>(); 
-  safeCall(cudaMallocArray((cudaArray **)&t_data, &t_desc, pitch, height)); 
-  if (t_data==NULL)
-    printf("Failed to allocated texture data\n");
-  double gpuTime = timer.read();
-#ifdef VERBOSE
-  printf("InitTexture time =            %.2f ms\n", gpuTime);
-#endif
-  return gpuTime;
-}
- 
-double CudaImage::CopyToTexture(CudaImage &dst, bool host)
-{
-  if (dst.t_data==NULL) {
-    printf("Error CopyToTexture: No texture data\n");
-    return 0.0;
-  }
-  if ((!host || h_data==NULL) && (host || d_data==NULL)) {
-    printf("Error CopyToTexture: No source data\n");
-    return 0.0;
-  }
-  TimerGPU timer(0);
-  if (host)
-    safeCall(cudaMemcpyToArray((cudaArray *)dst.t_data, 0, 0, h_data, sizeof(float)*pitch*dst.height, cudaMemcpyHostToDevice));
-  else
-    safeCall(cudaMemcpyToArray((cudaArray *)dst.t_data, 0, 0, d_data, sizeof(float)*pitch*dst.height, cudaMemcpyDeviceToDevice));
-  safeCall(cudaDeviceSynchronize());
-  double gpuTime = timer.read();
-#ifdef VERBOSE
-  printf("CopyToTexture time =          %.2f ms\n", gpuTime);
-#endif
-  return gpuTime;
-}
diff --git a/cudaImage.h b/cudaImage.h
deleted file mode 100644
index 8d4a47d..0000000
--- a/cudaImage.h
+++ /dev/null
@@ -1,34 +0,0 @@
-//********************************************************//
-// CUDA SIFT extractor by Marten Bjorkman aka Celebrandil //
-//********************************************************//  
-
-#ifndef CUDAIMAGE_H
-#define CUDAIMAGE_H
-
-class CudaImage {
-public:
-  int width, height;
-  int pitch;
-  float *h_data;
-  float *d_data;
-  float *t_data;
-  bool d_internalAlloc;
-  bool h_internalAlloc;
-public:
-  CudaImage();
-  ~CudaImage();
-  void Allocate(int width, int height, int pitch, bool withHost, float *devMem = NULL, float *hostMem = NULL);
-  double Download();
-  double Readback();
-  double InitTexture();
-  double CopyToTexture(CudaImage &dst, bool host);
-};
-
-int iDivUp(int a, int b);
-int iDivDown(int a, int b);
-int iAlignUp(int a, int b);
-int iAlignDown(int a, int b);
-void StartTimer(unsigned int *hTimer);
-double StopTimer(unsigned int hTimer);
-
-#endif // CUDAIMAGE_H
diff --git a/cudaSift.h b/cudaSift.h
deleted file mode 100644
index adc00a5..0000000
--- a/cudaSift.h
+++ /dev/null
@@ -1,45 +0,0 @@
-#ifndef CUDASIFT_H
-#define CUDASIFT_H
-
-#include "cudaImage.h"
-
-typedef struct {
-  float xpos;
-  float ypos;   
-  float scale;
-  float sharpness;
-  float edgeness;
-  float orientation;
-  float score;
-  float ambiguity;
-  int match;
-  float match_xpos;
-  float match_ypos;
-  float match_error;
-  float subsampling;
-  float empty[3];
-  float data[128];
-} SiftPoint;
-
-typedef struct {
-  int numPts;         // Number of available Sift points
-  int maxPts;         // Number of allocated Sift points
-#ifdef MANAGEDMEM
-  SiftPoint *m_data;  // Managed data
-#else
-  SiftPoint *h_data;  // Host (CPU) data
-  SiftPoint *d_data;  // Device (GPU) data
-#endif
-} SiftData;
-
-void InitCuda(int devNum = 0);
-float *AllocSiftTempMemory(int width, int height, int numOctaves, bool scaleUp = false);
-void FreeSiftTempMemory(float *memoryTmp);
-void ExtractSift(SiftData &siftData, CudaImage &img, int numOctaves, double initBlur, float thresh, float lowestScale = 0.0f, bool scaleUp = false, float *tempMemory = 0);
-void InitSiftData(SiftData &data, int num = 1024, bool host = false, bool dev = true);
-void FreeSiftData(SiftData &data);
-void PrintSiftData(SiftData &data);
-double MatchSiftData(SiftData &data1, SiftData &data2);
-double FindHomography(SiftData &data,  float *homography, int *numMatches, int numLoops = 1000, float minScore = 0.85f, float maxAmbiguity = 0.95f, float thresh = 5.0f);
-
-#endif
diff --git a/cudaSiftD.cu b/cudaSiftD.cu
deleted file mode 100644
index 67d82f1..0000000
--- a/cudaSiftD.cu
+++ /dev/null
@@ -1,2038 +0,0 @@
-//********************************************************//
-// CUDA SIFT extractor by Marten Bjorkman aka Celebrandil //
-//********************************************************//  
-
-#include "cudautils.h"
-#include "cudaSiftD.h"
-#include "cudaSift.h"
-
-///////////////////////////////////////////////////////////////////////////////
-// Kernel configuration
-///////////////////////////////////////////////////////////////////////////////
-
-// Upper bound the kernels apply to point counters and point indices.
-__constant__ int d_MaxNumPoints;
-// Point counters. The kernels in this file read/update entry 0 as well as
-// entries 2*octave-1, 2*octave+0 and 2*octave+1.
-__device__ unsigned int d_PointCounter[8*2+1];
-// Filter taps for the ScaleDown* kernels; those kernels read entries 0..2
-// and apply them symmetrically as [k0, k1, k2, k1, k0].
-__constant__ float d_ScaleDownKernel[5]; 
-// Low-pass filter taps, 2*LOWPASS_R+1 entries (LOWPASS_R is defined elsewhere).
-__constant__ float d_LowPassKernel[2*LOWPASS_R+1]; 
-// NOTE(review): not referenced by the kernels visible in this part of the
-// file; the 8*12*16 layout should be confirmed where it is filled and used.
-__constant__ float d_LaplaceKernel[8*12*16]; 
-
-///////////////////////////////////////////////////////////////////////////////
-// Lowpass filter and subsample image
-///////////////////////////////////////////////////////////////////////////////
-// Filters the image with the symmetric 5-tap kernel [k0, k1, k2, k1, k0] in
-// both directions and writes every second sample to a half-size output.
-//   d_Result : output image, row stride 'newpitch' (in floats)
-//   d_Data   : input image, row stride 'pitch' (in floats)
-//   width, height : input size; output size is (width/2) x (height/2)
-// The horizontal pass combines values held by neighbouring threads through
-// ShiftDown (declared elsewhere; NOTE(review): presumably a warp shuffle-down,
-// which would tie the result to how threads map onto warps - confirm block
-// shape at the launch site). The vertical pass reads the shared tile.
-__global__ void ScaleDownDenseShift(float *d_Result, float *d_Data, int width, int pitch, int height, int newpitch)
-{
-// Tile geometry; these macros stay defined after the kernel.
-#define BW (SCALEDOWN_W+4)
-#define BH (SCALEDOWN_H+4)
-#define W2 (SCALEDOWN_W/2)
-#define H2 (SCALEDOWN_H/2)
-  __shared__ float brows[BH*BW];
-  const int tx = threadIdx.x;
-  const int ty = threadIdx.y;
-  const int xp = blockIdx.x*SCALEDOWN_W + tx;
-  const int yp = blockIdx.y*SCALEDOWN_H + ty;
-  const float k0 = d_ScaleDownKernel[0];
-  const float k1 = d_ScaleDownKernel[1];
-  const float k2 = d_ScaleDownKernel[2];
-  // Source coordinates shifted by the 2-pixel filter radius and clamped to the image.
-  const int xl = min(width-1,  max(0, xp-2));
-  const int yl = min(height-1, max(0, yp-2));
-  if (xp<(width+4) && yp<(height+4)) {
-    float v = d_Data[yl*pitch + xl];
-    // Horizontal 5-tap filter over this thread's value and the values 1..4 lanes further on.
-    brows[BW*ty + tx]  = k0*(v + ShiftDown(v, 4)) + k1*(ShiftDown(v, 1) + ShiftDown(v, 3)) + k2*ShiftDown(v, 2);
-  }
-  __syncthreads();
-  const int xs = blockIdx.x*W2 + tx;
-  const int ys = blockIdx.y*H2 + ty;
-  if (tx<W2 && ty<H2 && xs<(width/2) && ys<(height/2)) {
-    // Vertical 5-tap filter starting at every second row/column of the tile.
-    float *ptr = &brows[BW*(ty*2) + (tx*2)];
-    d_Result[ys*newpitch + xs] = k0*(ptr[0] + ptr[4*BW]) + k1*(ptr[1*BW] + ptr[3*BW]) + k2*ptr[2*BW];
-  } 
-}
-
-// Filters the image with the symmetric 5-tap kernel [k0, k1, k2, k1, k0] in
-// both directions and writes every second sample to a half-size output,
-// using two shared-memory tiles (raw input, then horizontally filtered).
-//   d_Result : output image, row stride 'newpitch' (in floats)
-//   d_Data   : input image, row stride 'pitch' (in floats)
-//   width, height : input size; output size is (width/2) x (height/2)
-__global__ void ScaleDownDense(float *d_Result, float *d_Data, int width, int pitch, int height, int newpitch)
-{
-// Tile geometry; these macros stay defined after the kernel.
-#define BW (SCALEDOWN_W+4)
-#define BH (SCALEDOWN_H+4)
-#define W2 (SCALEDOWN_W/2)
-#define H2 (SCALEDOWN_H/2)
-  __shared__ float irows[BH*BW]; 
-  __shared__ float brows[BH*W2];
-  const int tx = threadIdx.x;
-  const int ty = threadIdx.y;
-  const int xp = blockIdx.x*SCALEDOWN_W + tx;
-  const int yp = blockIdx.y*SCALEDOWN_H + ty;
-  // Source coordinates shifted by the 2-pixel filter radius and clamped to the image.
-  const int xl = min(width-1,  max(0, xp-2));
-  const int yl = min(height-1, max(0, yp-2));
-  const float k0 = d_ScaleDownKernel[0];
-  const float k1 = d_ScaleDownKernel[1];
-  const float k2 = d_ScaleDownKernel[2];
-  // Stage 1: load the input tile (including the 4-pixel apron).
-  if (xp<(width+4) && yp<(height+4))
-    irows[BW*ty + tx] = d_Data[yl*pitch + xl];
-  __syncthreads();
-  // Stage 2: horizontal filter, keeping every second column.
-  if (yp<(height+4) && tx<W2) {
-    float *ptr = &irows[BW*ty + 2*tx];
-    brows[W2*ty + tx] = k0*(ptr[0] + ptr[4]) + k1*(ptr[1] + ptr[3]) + k2*ptr[2];
-  }
-  __syncthreads();
-  // Stage 3: vertical filter, keeping every second row.
-  const int xs = blockIdx.x*W2 + tx;
-  const int ys = blockIdx.y*H2 + ty;
-  if (tx<W2 && ty<H2 && xs<(width/2) && ys<(height/2)) {
-    float *ptr = &brows[W2*(ty*2) + tx];
-    d_Result[ys*newpitch + xs] = k0*(ptr[0] + ptr[4*W2]) + k1*(ptr[1*W2] + ptr[3*W2]) + k2*ptr[2*W2];
-  } 
-}
-
-// Half-size downscale with the symmetric 5-tap kernel [k0, k1, k2, k1, k0],
-// processing one input row at a time.
-//   d_Result : output image, row stride 'newpitch' (in floats)
-//   d_Data   : input image, row stride 'pitch' (in floats)
-//   width, height : input size
-// Only threadIdx.x is used. Each input row is filtered horizontally into one
-// of five row slots in 'brow'; on alternating rows the five slots are combined
-// vertically into one output row. The slot written and the slots read rotate
-// from stage to stage, so the statement order below must be preserved.
-// NOTE(review): every thread writes inrow[tx] and the first SCALEDOWN_H+4
-// threads fill yRead/yWrite, so the launch presumably uses SCALEDOWN_W+4
-// threads per block - confirm at the launch site.
-// NOTE(review): output rows are not checked against height/2 here; confirm
-// that the destination has enough rows for the last block.
-__global__ void ScaleDown(float *d_Result, float *d_Data, int width, int pitch, int height, int newpitch)
-{
-  __shared__ float inrow[SCALEDOWN_W+4]; 
-  __shared__ float brow[5*(SCALEDOWN_W/2)];
-  __shared__ int yRead[SCALEDOWN_H+4];
-  __shared__ int yWrite[SCALEDOWN_H+4];
-  // Width of one row slot in 'brow'; this macro stays defined after the kernel.
-  #define dx2 (SCALEDOWN_W/2)
-  const int tx = threadIdx.x;
-  // This thread's element in each of the five row slots.
-  const int tx0 = tx + 0*dx2;
-  const int tx1 = tx + 1*dx2;
-  const int tx2 = tx + 2*dx2;
-  const int tx3 = tx + 3*dx2;
-  const int tx4 = tx + 4*dx2;
-  const int xStart = blockIdx.x*SCALEDOWN_W;
-  const int yStart = blockIdx.y*SCALEDOWN_H;
-  const int xWrite = xStart/2 + tx;
-  float k0 = d_ScaleDownKernel[0];
-  float k1 = d_ScaleDownKernel[1];
-  float k2 = d_ScaleDownKernel[2];
-  // Precompute, per tile row, the clamped source row offset and the
-  // destination row offset.
-  if (tx<SCALEDOWN_H+4) {
-    int y = yStart + tx - 2; 
-    y = (y<0 ? 0 : y);
-    y = (y>=height ? height-1 : y);
-    yRead[tx] = y*pitch;
-    yWrite[tx] = (yStart + tx - 4)/2 * newpitch;
-  }
-  __syncthreads();
-  // Source column shifted by the 2-pixel filter radius and clamped to the image.
-  int xRead = xStart + tx - 2;
-  xRead = (xRead<0 ? 0 : xRead);
-  xRead = (xRead>=width ? width-1 : xRead);
-
-  // Number of output columns this block actually produces.
-  int maxtx = min(dx2, width/2 - xStart/2);
-  // Five tile rows per iteration, one per stage; each stage is skipped once
-  // its row index would fall outside the SCALEDOWN_H+4 rows of the tile.
-  for (int dy=0;dy<SCALEDOWN_H+4;dy+=5) {
-    {
-      // Row dy+0: newest filtered row goes to slot 4, slot 2 is the centre row.
-      inrow[tx] = d_Data[yRead[dy+0] + xRead];
-      __syncthreads();
-      if (tx<maxtx) {
-	brow[tx4] = k0*(inrow[2*tx]+inrow[2*tx+4]) + k1*(inrow[2*tx+1]+inrow[2*tx+3]) + k2*inrow[2*tx+2];
-	if (dy>=4 && !(dy&1))
-	  d_Result[yWrite[dy+0] + xWrite] = k2*brow[tx2] + k0*(brow[tx0]+brow[tx4]) + k1*(brow[tx1]+brow[tx3]);
-      }
-      __syncthreads();
-    }
-    if (dy<(SCALEDOWN_H+3)) {
-      // Row dy+1: newest filtered row goes to slot 0, slot 3 is the centre row.
-      inrow[tx] = d_Data[yRead[dy+1] + xRead];
-      __syncthreads();
-      if (tx<maxtx) {
-	brow[tx0] = k0*(inrow[2*tx]+inrow[2*tx+4]) + k1*(inrow[2*tx+1]+inrow[2*tx+3]) + k2*inrow[2*tx+2];
-	if (dy>=3 && (dy&1))
-	  d_Result[yWrite[dy+1] + xWrite] = k2*brow[tx3] + k0*(brow[tx1]+brow[tx0]) + k1*(brow[tx2]+brow[tx4]);
-      }
-      __syncthreads();
-    }
-    if (dy<(SCALEDOWN_H+2)) {
-      // Row dy+2: newest filtered row goes to slot 1, slot 4 is the centre row.
-      inrow[tx] = d_Data[yRead[dy+2] + xRead];
-      __syncthreads();
-      if (tx<maxtx) {
-	brow[tx1] = k0*(inrow[2*tx]+inrow[2*tx+4]) + k1*(inrow[2*tx+1]+inrow[2*tx+3]) + k2*inrow[2*tx+2];
-	if (dy>=2 && !(dy&1))
-	  d_Result[yWrite[dy+2] + xWrite] = k2*brow[tx4] + k0*(brow[tx2]+brow[tx1]) + k1*(brow[tx3]+brow[tx0]);
-      }
-      __syncthreads();
-    }
-    if (dy<(SCALEDOWN_H+1)) {
-      // Row dy+3: newest filtered row goes to slot 2, slot 0 is the centre row.
-      inrow[tx] = d_Data[yRead[dy+3] + xRead];
-      __syncthreads();
-      if (tx<maxtx) {
-	brow[tx2] = k0*(inrow[2*tx]+inrow[2*tx+4]) + k1*(inrow[2*tx+1]+inrow[2*tx+3]) + k2*inrow[2*tx+2];
-	if (dy>=1 && (dy&1))
-	  d_Result[yWrite[dy+3] + xWrite] = k2*brow[tx0] + k0*(brow[tx3]+brow[tx2]) + k1*(brow[tx4]+brow[tx1]);
-      }
-      __syncthreads();
-    }
-    if (dy<SCALEDOWN_H) {
-      // Row dy+4: newest filtered row goes to slot 3, slot 1 is the centre row.
-      inrow[tx] = d_Data[yRead[dy+4] + xRead];
-      __syncthreads();
-      // Same column guard as 'tx<maxtx' above, written out explicitly.
-      if (tx<dx2 && xWrite<width/2) {
-	brow[tx3] = k0*(inrow[2*tx]+inrow[2*tx+4]) + k1*(inrow[2*tx+1]+inrow[2*tx+3]) + k2*inrow[2*tx+2];
-	if (!(dy&1))
-	  d_Result[yWrite[dy+4] + xWrite] = k2*brow[tx1] + k0*(brow[tx4]+brow[tx3]) + k1*(brow[tx0]+brow[tx2]);
-      }
-      __syncthreads();
-    }
-  }
-}
-
-// Doubles the image size by bilinear interpolation. Each thread reads a 2x2
-// neighbourhood of the source (clamped at the right and bottom borders) and
-// writes the corresponding 2x2 block of the destination.
-//   d_Result : output image (2*width x 2*height), row stride 'newpitch'
-//   d_Data   : input image (width x height), row stride 'pitch'
-__global__ void ScaleUp(float *d_Result, float *d_Data, int width, int pitch, int height, int newpitch)
-{
-  const int tx = threadIdx.x;
-  const int ty = threadIdx.y;
-  // Top-left corner of the 2x2 destination block owned by this thread.
-  const int dstX = blockIdx.x*SCALEUP_W + 2*tx;
-  const int dstY = blockIdx.y*SCALEUP_H + 2*ty;
-  if (dstX>=2*width || dstY>=2*height)
-    return;
-
-  // Source neighbourhood; the right/lower neighbours are clamped to the image.
-  const int srcLeft = blockIdx.x*(SCALEUP_W/2) + tx;
-  const int srcTop = blockIdx.y*(SCALEUP_H/2) + ty;
-  const int srcRight = min(srcLeft + 1, width - 1);
-  const int srcBottom = min(srcTop + 1, height - 1);
-  const float topLeft = d_Data[srcTop*pitch + srcLeft];
-  const float topRight = d_Data[srcTop*pitch + srcRight];
-  const float bottomLeft = d_Data[srcBottom*pitch + srcLeft];
-  const float bottomRight = d_Data[srcBottom*pitch + srcRight];
-
-  // Upper destination row: the sample itself and the horizontal midpoint.
-  const int upper = (dstY + 0)*newpitch + dstX;
-  d_Result[upper + 0] = topLeft;
-  d_Result[upper + 1] = 0.50f*(topLeft + topRight);
-  // Lower destination row: the vertical midpoint and the centre of the cell.
-  const int lower = (dstY + 1)*newpitch + dstX;
-  d_Result[lower + 0] = 0.50f*(topLeft + bottomLeft);
-  d_Result[lower + 1] = 0.25f*(topLeft + topRight + bottomLeft + bottomRight);
-}
-
-// Builds the 128-float SIFT descriptor of one feature point per thread block
-// (point index = blockIdx.x + fstPts), then multiplies the point's position
-// and scale by 'subsampling'.
-//   texObj      : texture holding the image the gradients are sampled from
-//   d_sift      : point array; reads xpos/ypos/scale/orientation, writes data[]
-//   fstPts      : index of the first point handled by this launch
-//   subsampling : factor applied to xpos, ypos and scale at the end
-// Expects 16x8 threads per block: idx = ty*16 + tx addresses the 128-entry
-// histogram buffer, and the reduction assumes four groups of 32 threads.
-// ShiftDown is declared elsewhere (NOTE(review): presumably a warp
-// shuffle-down - confirm).
-__global__ void ExtractSiftDescriptors(cudaTextureObject_t texObj, SiftPoint *d_sift, int fstPts, float subsampling)
-{
-  __shared__ float gauss[16];
-  __shared__ float buffer[128];
-  __shared__ float sums[4];
-
-  const int tx = threadIdx.x; // 0 -> 16
-  const int ty = threadIdx.y; // 0 -> 8
-  const int idx = ty*16 + tx;
-  const int bx = blockIdx.x + fstPts;  // 0 -> numPts
-  if (ty==0)
-    gauss[tx] = exp(-(tx-7.5f)*(tx-7.5f)/128.0f);
-  buffer[idx] = 0.0f;
-  __syncthreads();
-
-  // Compute angles and gradients
-  float theta = 2.0f*3.1415f/360.0f*d_sift[bx].orientation;
-  float sina = sinf(theta);           // cosa -sina
-  float cosa = cosf(theta);           // sina  cosa
-  float scale = 12.0f/16.0f*d_sift[bx].scale;
-  float ssina = scale*sina;
-  float scosa = scale*cosa;
-
-  // Each thread handles two rows (y and y+8) of the 16x16 sample grid.
-  for (int y=ty;y<16;y+=8) {
-    float xpos = d_sift[bx].xpos + (tx-7.5f)*scosa - (y-7.5f)*ssina + 0.5f;
-    float ypos = d_sift[bx].ypos + (tx-7.5f)*ssina + (y-7.5f)*scosa + 0.5f;
-    float dx = tex2D<float>(texObj, xpos+cosa, ypos+sina) -
-      tex2D<float>(texObj, xpos-cosa, ypos-sina);
-    float dy = tex2D<float>(texObj, xpos-sina, ypos+cosa) -
-      tex2D<float>(texObj, xpos+sina, ypos-cosa);
-    float grad = gauss[y]*gauss[tx] * sqrtf(dx*dx + dy*dy);
-    float angf = 4.0f/3.1415f*atan2f(dy, dx) + 4.0f;
-
-    int hori = (tx + 2)/4 - 1;      // Convert from (tx,y,angle) to bins
-    float horf = (tx - 1.5f)/4.0f - hori;
-    float ihorf = 1.0f - horf;
-    int veri = (y + 2)/4 - 1;
-    float verf = (y - 1.5f)/4.0f - veri;
-    float iverf = 1.0f - verf;
-    int angi = angf;
-    angf -= angi;                   // fractional part, taken before wrapping
-    // An angle at (or, because pi is approximated by 3.1415, just below) +pi
-    // yields bin 8. Wrap it to bin 0 so the store stays inside this cell's
-    // eight bins and inside 'buffer'.
-    if (angi>=8)
-      angi -= 8;
-    int angp = (angi<7 ? angi+1 : 0);
-    float iangf = 1.0f - angf;
-
-    int hist = 8*(4*veri + hori);   // Each gradient measure is interpolated
-    int p1 = angi + hist;           // in angles, xpos and ypos -> 8 stores
-    int p2 = angp + hist;
-    if (tx>=2) {
-      float grad1 = ihorf*grad;
-      if (y>=2) {   // Upper left
-        float grad2 = iverf*grad1;
-        atomicAdd(buffer + p1, iangf*grad2);
-        atomicAdd(buffer + p2,  angf*grad2);
-      }
-      if (y<=13) {  // Lower left
-        float grad2 = verf*grad1;
-        atomicAdd(buffer + p1+32, iangf*grad2);
-        atomicAdd(buffer + p2+32,  angf*grad2);
-      }
-    }
-    if (tx<=13) {
-      float grad1 = horf*grad;
-      if (y>=2) {    // Upper right
-        float grad2 = iverf*grad1;
-        atomicAdd(buffer + p1+8, iangf*grad2);
-        atomicAdd(buffer + p2+8,  angf*grad2);
-      }
-      if (y<=13) {   // Lower right
-        float grad2 = verf*grad1;
-        atomicAdd(buffer + p1+40, iangf*grad2);
-        atomicAdd(buffer + p2+40,  angf*grad2);
-      }
-    }
-  }
-  __syncthreads();
-
-  // Normalize twice and suppress peaks first time
-  float sum = buffer[idx]*buffer[idx];
-  for (int i=16;i>0;i/=2)
-    sum += ShiftDown(sum, i);
-  if ((idx&31)==0)
-    sums[idx/32] = sum;
-  __syncthreads();
-  float tsum1 = sums[0] + sums[1] + sums[2] + sums[3];
-  tsum1 = min(buffer[idx] * rsqrtf(tsum1), 0.2f);
-  // Every thread must have read sums[] before it is overwritten below.
-  __syncthreads();
-
-  sum = tsum1*tsum1;
-  for (int i=16;i>0;i/=2)
-    sum += ShiftDown(sum, i);
-  if ((idx&31)==0)
-    sums[idx/32] = sum;
-  __syncthreads();
-
-  float tsum2 = sums[0] + sums[1] + sums[2] + sums[3];
-  float *desc = d_sift[bx].data;
-  desc[idx] = tsum1 * rsqrtf(tsum2);
-  if (idx==0) {
-    d_sift[bx].xpos *= subsampling;
-    d_sift[bx].ypos *= subsampling;
-    d_sift[bx].scale *= subsampling;
-  }
-}
-
-// Polynomial approximation of atan2(y, x), in radians.
-// The angle is approximated for the ratio min(|x|,|y|) / max(|x|,|y|), then
-// mirrored into the correct octant and given the sign of y.
-// Returns 0 for x == 0 and y == 0 (as atan2f(0, 0) does) instead of the NaN
-// that 0/0 would produce.
-__device__ float FastAtan2(float y, float x)
-{
-  float absx = abs(x);
-  float absy = abs(y);
-  float larger = max(absx, absy);
-  // Guard the division: both inputs zero would give 0/0 = NaN.
-  float a = (larger>0.0f ? __fdiv_rn(min(absx, absy), larger) : 0.0f);
-  float s = a*a;
-  float r = ((-0.0464964749f*s + 0.15931422f)*s - 0.327622764f)*s*a + a;
-  r = (absy>absx ? 1.57079637f - r : r);   // mirror about pi/4
-  r = (x<0 ? 3.14159274f - r : r);         // mirror about pi/2
-  r = (y<0 ? -r : r);                      // lower half plane
-  return r;
-}
-       
-// Builds the 128-float SIFT descriptors for the points of one octave with a
-// fixed number of blocks: each block handles points
-// blockIdx.x + fstPts, + gridDim.x, ... below totPts, where both limits come
-// from d_PointCounter and are capped by d_MaxNumPoints. Uses the fast math
-// intrinsics and FastAtan2. Afterwards each point's position and scale are
-// multiplied by 'subsampling'.
-// Expects 16x8 threads per block: idx = ty*16 + tx addresses the 128-entry
-// histogram buffer, and the reduction assumes four groups of 32 threads.
-// ShiftDown is declared elsewhere (NOTE(review): presumably a warp
-// shuffle-down - confirm).
-// NOTE(review): for octave == 0 the first-point lookup reads
-// d_PointCounter[-1]; confirm how callers number octaves.
-__global__ void ExtractSiftDescriptorsCONSTNew(cudaTextureObject_t texObj, SiftPoint *d_sift, float subsampling, int octave)
-{
-  __shared__ float gauss[16];
-  __shared__ float buffer[128];
-  __shared__ float sums[4];
-
-  const int tx = threadIdx.x; // 0 -> 16
-  const int ty = threadIdx.y; // 0 -> 8
-  const int idx = ty*16 + tx;
-  // Made visible to the other threads by the first barrier inside the loop.
-  if (ty==0)
-    gauss[tx] = __expf(-(tx-7.5f)*(tx-7.5f)/128.0f);
-
-  int fstPts = min(d_PointCounter[2*octave-1], d_MaxNumPoints);
-  int totPts = min(d_PointCounter[2*octave+1], d_MaxNumPoints);
-  //if (tx==0 && ty==0)
-  //  printf("%d %d %d %d\n", octave, fstPts, min(d_PointCounter[2*octave], d_MaxNumPoints), totPts);
-  for (int bx = blockIdx.x + fstPts; bx < totPts; bx += gridDim.x) {
-
-    buffer[idx] = 0.0f;
-    __syncthreads();
-
-    // Compute angles and gradients
-    float theta = 2.0f*3.1415f/360.0f*d_sift[bx].orientation;
-    float sina = __sinf(theta);           // cosa -sina
-    float cosa = __cosf(theta);           // sina  cosa
-    float scale = 12.0f/16.0f*d_sift[bx].scale;
-    float ssina = scale*sina;
-    float scosa = scale*cosa;
-
-    // Each thread handles two rows (y and y+8) of the 16x16 sample grid.
-    for (int y=ty;y<16;y+=8) {
-      float xpos = d_sift[bx].xpos + (tx-7.5f)*scosa - (y-7.5f)*ssina + 0.5f;
-      float ypos = d_sift[bx].ypos + (tx-7.5f)*ssina + (y-7.5f)*scosa + 0.5f;
-      float dx = tex2D<float>(texObj, xpos+cosa, ypos+sina) -
-        tex2D<float>(texObj, xpos-cosa, ypos-sina);
-      float dy = tex2D<float>(texObj, xpos-sina, ypos+cosa) -
-        tex2D<float>(texObj, xpos+sina, ypos-cosa);
-      float grad = gauss[y]*gauss[tx] * __fsqrt_rn(dx*dx + dy*dy);
-      float angf = 4.0f/3.1415f*FastAtan2(dy, dx) + 4.0f;
-
-      int hori = (tx + 2)/4 - 1;      // Convert from (tx,y,angle) to bins
-      float horf = (tx - 1.5f)/4.0f - hori;
-      float ihorf = 1.0f - horf;
-      int veri = (y + 2)/4 - 1;
-      float verf = (y - 1.5f)/4.0f - veri;
-      float iverf = 1.0f - verf;
-      int angi = angf;
-      angf -= angi;                   // fractional part, taken before wrapping
-      // An angle at (or, because pi is approximated by 3.1415, just below)
-      // +pi yields bin 8. Wrap it to bin 0 so the store stays inside this
-      // cell's eight bins and inside 'buffer'.
-      if (angi>=8)
-        angi -= 8;
-      int angp = (angi<7 ? angi+1 : 0);
-      float iangf = 1.0f - angf;
-
-      int hist = 8*(4*veri + hori);   // Each gradient measure is interpolated
-      int p1 = angi + hist;           // in angles, xpos and ypos -> 8 stores
-      int p2 = angp + hist;
-      if (tx>=2) {
-        float grad1 = ihorf*grad;
-        if (y>=2) {   // Upper left
-          float grad2 = iverf*grad1;
-          atomicAdd(buffer + p1, iangf*grad2);
-          atomicAdd(buffer + p2,  angf*grad2);
-        }
-        if (y<=13) {  // Lower left
-          float grad2 = verf*grad1;
-          atomicAdd(buffer + p1+32, iangf*grad2);
-          atomicAdd(buffer + p2+32,  angf*grad2);
-        }
-      }
-      if (tx<=13) {
-        float grad1 = horf*grad;
-        if (y>=2) {    // Upper right
-          float grad2 = iverf*grad1;
-          atomicAdd(buffer + p1+8, iangf*grad2);
-          atomicAdd(buffer + p2+8,  angf*grad2);
-        }
-        if (y<=13) {   // Lower right
-          float grad2 = verf*grad1;
-          atomicAdd(buffer + p1+40, iangf*grad2);
-          atomicAdd(buffer + p2+40,  angf*grad2);
-        }
-      }
-    }
-    __syncthreads();
-
-    // Normalize twice and suppress peaks first time
-    float sum = buffer[idx]*buffer[idx];
-    for (int i=16;i>0;i/=2)
-      sum += ShiftDown(sum, i);
-    if ((idx&31)==0)
-      sums[idx/32] = sum;
-    __syncthreads();
-    float tsum1 = sums[0] + sums[1] + sums[2] + sums[3];
-    tsum1 = min(buffer[idx] * rsqrtf(tsum1), 0.2f);
-    // Every thread must have read sums[] before it is overwritten below.
-    __syncthreads();
-
-    sum = tsum1*tsum1;
-    for (int i=16;i>0;i/=2)
-      sum += ShiftDown(sum, i);
-    if ((idx&31)==0)
-      sums[idx/32] = sum;
-    __syncthreads();
-
-    float tsum2 = sums[0] + sums[1] + sums[2] + sums[3];
-    float *desc = d_sift[bx].data;
-    desc[idx] = tsum1 * rsqrtf(tsum2);
-    if (idx==0) {
-      d_sift[bx].xpos *= subsampling;
-      d_sift[bx].ypos *= subsampling;
-      d_sift[bx].scale *= subsampling;
-    }
-    // Keep the shared buffers intact until every thread is done with this point.
-    __syncthreads();
-  }
-}
- 
-
-// Builds the 128-float SIFT descriptors for the points of one octave with a
-// fixed number of blocks: each block handles points
-// blockIdx.x + fstPts, + gridDim.x, ... below totPts, where both limits come
-// from d_PointCounter and are capped by d_MaxNumPoints. Afterwards each
-// point's position and scale are multiplied by 'subsampling'.
-// Expects 16x8 threads per block: idx = ty*16 + tx addresses the 128-entry
-// histogram buffer, and the reduction assumes four groups of 32 threads.
-// ShiftDown is declared elsewhere (NOTE(review): presumably a warp
-// shuffle-down - confirm).
-// NOTE(review): for octave == 0 the first-point lookup reads
-// d_PointCounter[-1]; confirm how callers number octaves.
-__global__ void ExtractSiftDescriptorsCONST(cudaTextureObject_t texObj, SiftPoint *d_sift, float subsampling, int octave)
-{
-  __shared__ float gauss[16];
-  __shared__ float buffer[128];
-  __shared__ float sums[4];
-
-  const int tx = threadIdx.x; // 0 -> 16
-  const int ty = threadIdx.y; // 0 -> 8
-  const int idx = ty*16 + tx;
-  // Made visible to the other threads by the first barrier inside the loop.
-  if (ty==0)
-    gauss[tx] = exp(-(tx-7.5f)*(tx-7.5f)/128.0f);
-
-  int fstPts = min(d_PointCounter[2*octave-1], d_MaxNumPoints);
-  int totPts = min(d_PointCounter[2*octave+1], d_MaxNumPoints);
-  //if (tx==0 && ty==0)
-  //  printf("%d %d %d %d\n", octave, fstPts, min(d_PointCounter[2*octave], d_MaxNumPoints), totPts);
-  for (int bx = blockIdx.x + fstPts; bx < totPts; bx += gridDim.x) {
-
-    buffer[idx] = 0.0f;
-    __syncthreads();
-
-    // Compute angles and gradients
-    float theta = 2.0f*3.1415f/360.0f*d_sift[bx].orientation;
-    float sina = sinf(theta);           // cosa -sina
-    float cosa = cosf(theta);           // sina  cosa
-    float scale = 12.0f/16.0f*d_sift[bx].scale;
-    float ssina = scale*sina;
-    float scosa = scale*cosa;
-
-    // Each thread handles two rows (y and y+8) of the 16x16 sample grid.
-    for (int y=ty;y<16;y+=8) {
-      float xpos = d_sift[bx].xpos + (tx-7.5f)*scosa - (y-7.5f)*ssina + 0.5f;
-      float ypos = d_sift[bx].ypos + (tx-7.5f)*ssina + (y-7.5f)*scosa + 0.5f;
-      float dx = tex2D<float>(texObj, xpos+cosa, ypos+sina) -
-        tex2D<float>(texObj, xpos-cosa, ypos-sina);
-      float dy = tex2D<float>(texObj, xpos-sina, ypos+cosa) -
-        tex2D<float>(texObj, xpos+sina, ypos-cosa);
-      float grad = gauss[y]*gauss[tx] * sqrtf(dx*dx + dy*dy);
-      float angf = 4.0f/3.1415f*atan2f(dy, dx) + 4.0f;
-
-      int hori = (tx + 2)/4 - 1;      // Convert from (tx,y,angle) to bins
-      float horf = (tx - 1.5f)/4.0f - hori;
-      float ihorf = 1.0f - horf;
-      int veri = (y + 2)/4 - 1;
-      float verf = (y - 1.5f)/4.0f - veri;
-      float iverf = 1.0f - verf;
-      int angi = angf;
-      angf -= angi;                   // fractional part, taken before wrapping
-      // An angle at (or, because pi is approximated by 3.1415, just below)
-      // +pi yields bin 8. Wrap it to bin 0 so the store stays inside this
-      // cell's eight bins and inside 'buffer'.
-      if (angi>=8)
-        angi -= 8;
-      int angp = (angi<7 ? angi+1 : 0);
-      float iangf = 1.0f - angf;
-
-      int hist = 8*(4*veri + hori);   // Each gradient measure is interpolated
-      int p1 = angi + hist;           // in angles, xpos and ypos -> 8 stores
-      int p2 = angp + hist;
-      if (tx>=2) {
-        float grad1 = ihorf*grad;
-        if (y>=2) {   // Upper left
-          float grad2 = iverf*grad1;
-          atomicAdd(buffer + p1, iangf*grad2);
-          atomicAdd(buffer + p2,  angf*grad2);
-        }
-        if (y<=13) {  // Lower left
-          float grad2 = verf*grad1;
-          atomicAdd(buffer + p1+32, iangf*grad2);
-          atomicAdd(buffer + p2+32,  angf*grad2);
-        }
-      }
-      if (tx<=13) {
-        float grad1 = horf*grad;
-        if (y>=2) {    // Upper right
-          float grad2 = iverf*grad1;
-          atomicAdd(buffer + p1+8, iangf*grad2);
-          atomicAdd(buffer + p2+8,  angf*grad2);
-        }
-        if (y<=13) {   // Lower right
-          float grad2 = verf*grad1;
-          atomicAdd(buffer + p1+40, iangf*grad2);
-          atomicAdd(buffer + p2+40,  angf*grad2);
-        }
-      }
-    }
-    __syncthreads();
-
-    // Normalize twice and suppress peaks first time
-    float sum = buffer[idx]*buffer[idx];
-    for (int i=16;i>0;i/=2)
-      sum += ShiftDown(sum, i);
-    if ((idx&31)==0)
-      sums[idx/32] = sum;
-    __syncthreads();
-    float tsum1 = sums[0] + sums[1] + sums[2] + sums[3];
-    tsum1 = min(buffer[idx] * rsqrtf(tsum1), 0.2f);
-    // Every thread must have read sums[] before it is overwritten below.
-    __syncthreads();
-
-    sum = tsum1*tsum1;
-    for (int i=16;i>0;i/=2)
-      sum += ShiftDown(sum, i);
-    if ((idx&31)==0)
-      sums[idx/32] = sum;
-    __syncthreads();
-
-    float tsum2 = sums[0] + sums[1] + sums[2] + sums[3];
-    float *desc = d_sift[bx].data;
-    desc[idx] = tsum1 * rsqrtf(tsum2);
-    if (idx==0) {
-      d_sift[bx].xpos *= subsampling;
-      d_sift[bx].ypos *= subsampling;
-      d_sift[bx].scale *= subsampling;
-    }
-    // Keep the shared buffers intact until every thread is done with this point.
-    __syncthreads();
-  }
-}
- 
-
-// Older variant of the descriptor kernel: builds the 128-float SIFT
-// descriptor of one feature point per thread block (point index =
-// blockIdx.x + fstPts) and normalises it with a shared-memory tree reduction.
-// Afterwards the point's position and scale are multiplied by 'subsampling'.
-// Expects 16x8 threads per block: idx = ty*16 + tx addresses the 128-entry
-// histogram buffer.
-__global__ void ExtractSiftDescriptorsOld(cudaTextureObject_t texObj, SiftPoint *d_sift, int fstPts, float subsampling)
-{
-  __shared__ float gauss[16];
-  __shared__ float buffer[128];
-  __shared__ float sums[128];
-
-  const int tx = threadIdx.x; // 0 -> 16
-  const int ty = threadIdx.y; // 0 -> 8
-  const int idx = ty*16 + tx;
-  const int bx = blockIdx.x + fstPts;  // 0 -> numPts
-  if (ty==0)
-    gauss[tx] = exp(-(tx-7.5f)*(tx-7.5f)/128.0f);
-  buffer[idx] = 0.0f;
-  __syncthreads();
-
-  // Compute angles and gradients
-  float theta = 2.0f*3.1415f/360.0f*d_sift[bx].orientation;
-  float sina = sinf(theta);           // cosa -sina
-  float cosa = cosf(theta);           // sina  cosa
-  float scale = 12.0f/16.0f*d_sift[bx].scale;
-  float ssina = scale*sina;
-  float scosa = scale*cosa;
-
-  // Each thread handles two rows (y and y+8) of the 16x16 sample grid.
-  for (int y=ty;y<16;y+=8) {
-    float xpos = d_sift[bx].xpos + (tx-7.5f)*scosa - (y-7.5f)*ssina + 0.5f;
-    float ypos = d_sift[bx].ypos + (tx-7.5f)*ssina + (y-7.5f)*scosa + 0.5f;
-    float dx = tex2D<float>(texObj, xpos+cosa, ypos+sina) -
-      tex2D<float>(texObj, xpos-cosa, ypos-sina);
-    float dy = tex2D<float>(texObj, xpos-sina, ypos+cosa) -
-      tex2D<float>(texObj, xpos+sina, ypos-cosa);
-    float grad = gauss[y]*gauss[tx] * sqrtf(dx*dx + dy*dy);
-    float angf = 4.0f/3.1415f*atan2f(dy, dx) + 4.0f;
-
-    int hori = (tx + 2)/4 - 1;      // Convert from (tx,y,angle) to bins
-    float horf = (tx - 1.5f)/4.0f - hori;
-    float ihorf = 1.0f - horf;
-    int veri = (y + 2)/4 - 1;
-    float verf = (y - 1.5f)/4.0f - veri;
-    float iverf = 1.0f - verf;
-    int angi = angf;
-    angf -= angi;                   // fractional part, taken before wrapping
-    // An angle at (or, because pi is approximated by 3.1415, just below) +pi
-    // yields bin 8. Wrap it to bin 0 so the store stays inside this cell's
-    // eight bins and inside 'buffer'.
-    if (angi>=8)
-      angi -= 8;
-    int angp = (angi<7 ? angi+1 : 0);
-    float iangf = 1.0f - angf;
-
-    int hist = 8*(4*veri + hori);   // Each gradient measure is interpolated
-    int p1 = angi + hist;           // in angles, xpos and ypos -> 8 stores
-    int p2 = angp + hist;
-    if (tx>=2) {
-      float grad1 = ihorf*grad;
-      if (y>=2) {   // Upper left
-        float grad2 = iverf*grad1;
-        atomicAdd(buffer + p1, iangf*grad2);
-        atomicAdd(buffer + p2,  angf*grad2);
-      }
-      if (y<=13) {  // Lower left
-        float grad2 = verf*grad1;
-        atomicAdd(buffer + p1+32, iangf*grad2);
-        atomicAdd(buffer + p2+32,  angf*grad2);
-      }
-    }
-    if (tx<=13) {
-      float grad1 = horf*grad;
-      if (y>=2) {    // Upper right
-        float grad2 = iverf*grad1;
-        atomicAdd(buffer + p1+8, iangf*grad2);
-        atomicAdd(buffer + p2+8,  angf*grad2);
-      }
-      if (y<=13) {   // Lower right
-        float grad2 = verf*grad1;
-        atomicAdd(buffer + p1+40, iangf*grad2);
-        atomicAdd(buffer + p2+40,  angf*grad2);
-      }
-    }
-  }
-  __syncthreads();
-
-  // Normalize twice and suppress peaks first time
-  // Tree reduction of the squared entries down to four partial sums.
-  if (idx<64)
-    sums[idx] = buffer[idx]*buffer[idx] + buffer[idx+64]*buffer[idx+64];
-  __syncthreads();
-  if (idx<32) sums[idx] = sums[idx] + sums[idx+32];
-  __syncthreads();
-  if (idx<16) sums[idx] = sums[idx] + sums[idx+16];
-  __syncthreads();
-  if (idx<8)  sums[idx] = sums[idx] + sums[idx+8];
-  __syncthreads();
-  if (idx<4)  sums[idx] = sums[idx] + sums[idx+4];
-  __syncthreads();
-  float tsum1 = sums[0] + sums[1] + sums[2] + sums[3];
-  buffer[idx] = buffer[idx] * rsqrtf(tsum1);
-
-  // Clamp large entries, then normalise once more.
-  if (buffer[idx]>0.2f)
-    buffer[idx] = 0.2f;
-  __syncthreads();
-  if (idx<64)
-    sums[idx] = buffer[idx]*buffer[idx] + buffer[idx+64]*buffer[idx+64];
-  __syncthreads();
-  if (idx<32) sums[idx] = sums[idx] + sums[idx+32];
-  __syncthreads();
-  if (idx<16) sums[idx] = sums[idx] + sums[idx+16];
-  __syncthreads();
-  if (idx<8)  sums[idx] = sums[idx] + sums[idx+8];
-  __syncthreads();
-  if (idx<4)  sums[idx] = sums[idx] + sums[idx+4];
-  __syncthreads();
-  float tsum2 = sums[0] + sums[1] + sums[2] + sums[3];
-
-  float *desc = d_sift[bx].data;
-  desc[idx] = buffer[idx] * rsqrtf(tsum2);
-  if (idx==0) {
-    d_sift[bx].xpos *= subsampling;
-    d_sift[bx].ypos *= subsampling;
-    d_sift[bx].scale *= subsampling;
-  }
-}
-
-
-// Device-side helper that builds the 128-float SIFT descriptor of point 'bx'
-// using all threads of the calling block, then multiplies the point's
-// position and scale by 'subsampling'.
-// Must be called by every thread of the block (it contains block-wide
-// barriers). Expects 128 threads indexed by threadIdx.x: idx addresses the
-// 128-entry histogram buffer and is split into tx = idx & 15, ty = idx / 16.
-// 'octave' is not used by the body.
-// ShiftDown is declared elsewhere (NOTE(review): presumably a warp
-// shuffle-down - confirm).
-__device__ void ExtractSiftDescriptor(cudaTextureObject_t texObj, SiftPoint *d_sift, float subsampling, int octave, int bx)
-{
-  __shared__ float gauss[16];
-  __shared__ float buffer[128];
-  __shared__ float sums[4];
-
-  const int idx = threadIdx.x;
-  const int tx = idx & 15; // 0 -> 16
-  const int ty = idx / 16; // 0 -> 8
-  if (ty==0)
-    gauss[tx] = exp(-(tx-7.5f)*(tx-7.5f)/128.0f);
-  buffer[idx] = 0.0f;
-  __syncthreads();
-
-  // Compute angles and gradients
-  float theta = 2.0f*3.1415f/360.0f*d_sift[bx].orientation;
-  float sina = sinf(theta);           // cosa -sina
-  float cosa = cosf(theta);           // sina  cosa
-  float scale = 12.0f/16.0f*d_sift[bx].scale;
-  float ssina = scale*sina;
-  float scosa = scale*cosa;
-
-  // Each thread handles two rows (y and y+8) of the 16x16 sample grid.
-  for (int y=ty;y<16;y+=8) {
-    float xpos = d_sift[bx].xpos + (tx-7.5f)*scosa - (y-7.5f)*ssina + 0.5f;
-    float ypos = d_sift[bx].ypos + (tx-7.5f)*ssina + (y-7.5f)*scosa + 0.5f;
-    float dx = tex2D<float>(texObj, xpos+cosa, ypos+sina) -
-      tex2D<float>(texObj, xpos-cosa, ypos-sina);
-    float dy = tex2D<float>(texObj, xpos-sina, ypos+cosa) -
-      tex2D<float>(texObj, xpos+sina, ypos-cosa);
-    float grad = gauss[y]*gauss[tx] * sqrtf(dx*dx + dy*dy);
-    float angf = 4.0f/3.1415f*atan2f(dy, dx) + 4.0f;
-
-    int hori = (tx + 2)/4 - 1;      // Convert from (tx,y,angle) to bins
-    float horf = (tx - 1.5f)/4.0f - hori;
-    float ihorf = 1.0f - horf;
-    int veri = (y + 2)/4 - 1;
-    float verf = (y - 1.5f)/4.0f - veri;
-    float iverf = 1.0f - verf;
-    int angi = angf;
-    angf -= angi;                   // fractional part, taken before wrapping
-    // An angle at (or, because pi is approximated by 3.1415, just below) +pi
-    // yields bin 8. Wrap it to bin 0 so the store stays inside this cell's
-    // eight bins and inside 'buffer'.
-    if (angi>=8)
-      angi -= 8;
-    int angp = (angi<7 ? angi+1 : 0);
-    float iangf = 1.0f - angf;
-
-    int hist = 8*(4*veri + hori);   // Each gradient measure is interpolated
-    int p1 = angi + hist;           // in angles, xpos and ypos -> 8 stores
-    int p2 = angp + hist;
-    if (tx>=2) {
-      float grad1 = ihorf*grad;
-      if (y>=2) {   // Upper left
-        float grad2 = iverf*grad1;
-        atomicAdd(buffer + p1, iangf*grad2);
-        atomicAdd(buffer + p2,  angf*grad2);
-      }
-      if (y<=13) {  // Lower left
-        float grad2 = verf*grad1;
-        atomicAdd(buffer + p1+32, iangf*grad2);
-        atomicAdd(buffer + p2+32,  angf*grad2);
-      }
-    }
-    if (tx<=13) {
-      float grad1 = horf*grad;
-      if (y>=2) {    // Upper right
-        float grad2 = iverf*grad1;
-        atomicAdd(buffer + p1+8, iangf*grad2);
-        atomicAdd(buffer + p2+8,  angf*grad2);
-      }
-      if (y<=13) {   // Lower right
-        float grad2 = verf*grad1;
-        atomicAdd(buffer + p1+40, iangf*grad2);
-        atomicAdd(buffer + p2+40,  angf*grad2);
-      }
-    }
-  }
-  __syncthreads();
-
-  // Normalize twice and suppress peaks first time
-  float sum = buffer[idx]*buffer[idx];
-  for (int i=16;i>0;i/=2)
-    sum += ShiftDown(sum, i);
-  if ((idx&31)==0)
-    sums[idx/32] = sum;
-  __syncthreads();
-  float tsum1 = sums[0] + sums[1] + sums[2] + sums[3];
-  tsum1 = min(buffer[idx] * rsqrtf(tsum1), 0.2f);
-  // Every thread must have read sums[] before it is overwritten below.
-  __syncthreads();
-
-  sum = tsum1*tsum1;
-  for (int i=16;i>0;i/=2)
-    sum += ShiftDown(sum, i);
-  if ((idx&31)==0)
-    sums[idx/32] = sum;
-  __syncthreads();
-
-  float tsum2 = sums[0] + sums[1] + sums[2] + sums[3];
-  float *desc = d_sift[bx].data;
-  desc[idx] = tsum1 * rsqrtf(tsum2);
-  if (idx==0) {
-    d_sift[bx].xpos *= subsampling;
-    d_sift[bx].ypos *= subsampling;
-    d_sift[bx].scale *= subsampling;
-  }
-  // Keep the shared buffers intact until every thread is done with this point.
-  __syncthreads();
-}
-
-
-// Multiplies the position and scale of each of the first 'numPts' points by
-// 'scale'; one thread per point, threads past the end do nothing.
-__global__ void RescalePositions(SiftPoint *d_sift, int numPts, float scale)
-{
-  const int pointIdx = blockIdx.x*blockDim.x + threadIdx.x;
-  if (pointIdx>=numPts)
-    return;
-  SiftPoint &pt = d_sift[pointIdx];
-  pt.xpos *= scale;
-  pt.ypos *= scale;
-  pt.scale *= scale;
-}
-
-
-// Estimates the dominant gradient orientation of one feature point per
-// thread block (point index = blockIdx.x + fstPts) and stores it in degrees.
-// Gradients are sampled on an 11x11 window around the point (threads with
-// tx/11 < 11), weighted by a Gaussian and accumulated in a 32-bin histogram
-// (hist[0..31]); hist[32..63] holds the smoothed histogram. If a second peak
-// exceeds 80% of the highest one, a copy of the point with that orientation
-// is appended at the index obtained from d_PointCounter[0], provided that
-// index is below d_MaxNumPoints.
-// NOTE(review): needs at least 121 threads per block to cover the window -
-// confirm at the launch site.
-__global__ void ComputeOrientations(cudaTextureObject_t texObj, SiftPoint *d_Sift, int fstPts)
-{
-  __shared__ float hist[64];
-  __shared__ float gauss[11];
-  const int tx = threadIdx.x;
-  const int bx = blockIdx.x + fstPts;
-  float i2sigma2 = -1.0f/(4.5f*d_Sift[bx].scale*d_Sift[bx].scale);
-  if (tx<11)
-    gauss[tx] = exp(i2sigma2*(tx-5)*(tx-5));
-  if (tx<64)
-    hist[tx] = 0.0f;
-  __syncthreads();
-  float xp = d_Sift[bx].xpos - 4.5f;
-  float yp = d_Sift[bx].ypos - 4.5f;
-  int yd = tx/11;
-  int xd = tx - yd*11;
-  float xf = xp + xd;
-  float yf = yp + yd;
-  if (yd<11) {
-    // Single-precision offsets keep the coordinate math out of FP64.
-    float dx = tex2D<float>(texObj, xf+1.0f, yf) - tex2D<float>(texObj, xf-1.0f, yf);
-    float dy = tex2D<float>(texObj, xf, yf+1.0f) - tex2D<float>(texObj, xf, yf-1.0f);
-    int bin = 16.0f*atan2f(dy, dx)/3.1416f + 16.5f;
-    if (bin>31)   // +pi wraps around to the first bin
-      bin = 0;
-    float grad = sqrtf(dx*dx + dy*dy);
-    atomicAdd(&hist[bin], grad*gauss[xd]*gauss[yd]);
-  }
-  __syncthreads();
-  // Circular neighbours of bin tx (only meaningful for tx < 32).
-  int x1m = (tx>=1 ? tx-1 : tx+31);
-  int x1p = (tx<=30 ? tx+1 : tx-31);
-  if (tx<32) {
-    int x2m = (tx>=2 ? tx-2 : tx+30);
-    int x2p = (tx<=29 ? tx+2 : tx-30);
-    // Smooth with weights [1, 4, 6, 4, 1] into the upper half.
-    hist[tx+32] = 6.0f*hist[tx] + 4.0f*(hist[x1m] + hist[x1p]) + (hist[x2m] + hist[x2p]);
-  }
-  __syncthreads();
-  if (tx<32) {
-    // Keep only local maxima of the smoothed histogram.
-    float v = hist[32+tx];
-    hist[tx] = (v>hist[32+x1m] && v>=hist[32+x1p] ? v : 0.0f);
-  }
-  __syncthreads();
-  if (tx==0) {
-    // Find the two highest local maxima.
-    float maxval1 = 0.0f;
-    float maxval2 = 0.0f;
-    int i1 = -1;
-    int i2 = -1;
-    for (int i=0;i<32;i++) {
-      float v = hist[i];
-      if (v>maxval1) {
-        maxval2 = maxval1;
-        maxval1 = v;
-        i2 = i1;
-        i1 = i;
-      } else if (v>maxval2) {
-        maxval2 = v;
-        i2 = i;
-      }
-    }
-    // Refine the peak position from its two neighbours; 11.25 degrees per bin.
-    float val1 = hist[32+((i1+1)&31)];
-    float val2 = hist[32+((i1+31)&31)];
-    float peak = i1 + 0.5f*(val1-val2) / (2.0f*maxval1-val1-val2);
-    d_Sift[bx].orientation = 11.25f*(peak<0.0f ? peak+32.0f : peak);
-    if (maxval2>0.8f*maxval1) {
-      float val1 = hist[32+((i2+1)&31)];
-      float val2 = hist[32+((i2+31)&31)];
-      float peak = i2 + 0.5f*(val1-val2) / (2.0f*maxval2-val1-val2);
-      unsigned int idx = atomicInc(d_PointCounter, 0x7fffffff);
-      if (idx<d_MaxNumPoints) {
-        d_Sift[idx].xpos = d_Sift[bx].xpos;
-        d_Sift[idx].ypos = d_Sift[bx].ypos;
-        d_Sift[idx].scale = d_Sift[bx].scale;
-        d_Sift[idx].sharpness = d_Sift[bx].sharpness;
-        d_Sift[idx].edgeness = d_Sift[bx].edgeness;
-        d_Sift[idx].orientation = 11.25f*(peak<0.0f ? peak+32.0f : peak);
-        d_Sift[idx].subsampling = d_Sift[bx].subsampling;
-      }
-    }
-  }
-}
-
-// With constant number of blocks
-// Estimates the dominant gradient orientation (in degrees) for the points of
-// one octave with a fixed number of blocks: each block handles points
-// blockIdx.x + fstPts, + gridDim.x, ... below totPts, where both limits come
-// from d_PointCounter and are capped by d_MaxNumPoints.
-//   image   : image data, 'w' x 'h' pixels with row stride 'p' (in floats)
-//   d_Sift  : point array; reads xpos/ypos/scale, writes orientation
-//   octave  : selects the d_PointCounter entries to use
-// Per point: a (2*RAD+1)^2 window is loaded (clamped to the image), blurred
-// vertically and horizontally with the 5-tap kernel
-// [fac0, fac1, 1, fac1, fac0], and its gradients are accumulated in a
-// Gaussian-weighted LEN-bin histogram. The histogram is smoothed twice and
-// its two highest local maxima are located. If the second exceeds 80% of the
-// first, a copy of the point with that orientation is appended at the index
-// obtained from d_PointCounter[2*octave+1], provided it is below
-// d_MaxNumPoints.
-// NOTE(review): needs at least LEN threads per block for the histogram
-// stages - confirm at the launch site.
-// NOTE(review): for octave == 0 the first-point lookup reads
-// d_PointCounter[-1]; confirm how callers number octaves.
-__global__ void ComputeOrientationsCONSTNew(float *image, int w, int p, int h, SiftPoint *d_Sift, int octave)
-{
-#define RAD 9
-#define WID (2*RAD + 1)
-#define LEN 32                                   //%%%% Note: Lowe suggests 36, not 32
-  __shared__ float img[WID][WID], tmp[WID][WID];
-  __shared__ float hist[2*LEN];
-  __shared__ float gaussx[WID], gaussy[WID];
-  const int tx = threadIdx.x;
-
-  int fstPts = min(d_PointCounter[2*octave-1], d_MaxNumPoints);
-  int totPts = min(d_PointCounter[2*octave+0], d_MaxNumPoints);
-  for (int bx = blockIdx.x + fstPts; bx < totPts; bx += gridDim.x) {
-
-    float sc = d_Sift[bx].scale;
-    for (int i=tx;i<2*LEN;i+=blockDim.x)
-      hist[i] = 0.0f;
-    float xp = d_Sift[bx].xpos;
-    float yp = d_Sift[bx].ypos;
-    int xi = (int)xp;
-    int yi = (int)yp;
-    float xf = xp - xi;
-    float yf = yp - yi;
-    // Load the window around the point, clamping coordinates to the image.
-    for (int i=tx;i<WID*WID;i+=blockDim.x) {
-      int y = i/WID;
-      int x = i - y*WID;
-      int xp = max(min(x - RAD + xi, w - 1), 0);
-      int yp = max(min(y - RAD + yi, h - 1), 0);
-      img[y][x] = image[yp*p + xp];
-    }
-    // Blur taps; all off-centre taps are zero when sc <= 0.5.
-    float fac[5];
-    fac[1] = fac[3] = (sc>0.5f ? __expf(-1.0f/(2.0f*(sc*sc - 0.25f))) : 0.0f);
-    fac[0] = fac[4] = (sc>0.5f ? __expf(-4.0f/(2.0f*(sc*sc - 0.25f))) : 0.0f);
-    fac[2] = 1.0f;
-    float i2sigma2 = -1.0f/(2.0f*2.0f*2.0f*sc*sc); //%%%% Note: Lowe suggests 1.5, not 2.0
-    // Window weights, centred on the sub-pixel position of the point.
-    if (tx<WID) {
-      gaussx[tx] = __expf(i2sigma2*(tx-RAD-xf)*(tx-RAD-xf));
-      gaussy[tx] = __expf(i2sigma2*(tx-RAD-yf)*(tx-RAD-yf));
-    }
-    __syncthreads();
-    // Vertical blur: img -> tmp (rows 2 .. WID-3).
-    for (int i=tx;i<(WID-4)*WID;i+=blockDim.x) {
-      int y = i/WID;
-      int x = i - y*WID;
-      y += 2;
-      tmp[y][x] = img[y][x] + fac[1]*(img[y-1][x] + img[y+1][x]) +
-        fac[0]*(img[y-2][x] + img[y+2][x]);
-    }
-    __syncthreads();
-    // Horizontal blur: tmp -> img (rows and columns 2 .. WID-3).
-    for (int i=tx;i<(WID-4)*(WID-4);i+=blockDim.x) {
-      int y = i/(WID-4);
-      int x = i - y*(WID-4);
-      x += 2;
-      y += 2;
-      img[y][x] = tmp[y][x] + fac[1]*(tmp[y][x-1] + tmp[y][x+1]) +
-        fac[0]*(tmp[y][x-2] + tmp[y][x+2]);
-    }
-    __syncthreads();
-    // Gradient histogram over the inner window; raw counts go to the upper half.
-    for (int i=tx;i<(WID-6)*(WID-6);i+=blockDim.x) {
-      int y = i/(WID-6);
-      int x = i - y*(WID-6);
-      x += 3;
-      y += 3;
-      float dx = img[y][x+1] - img[y][x-1];
-      float dy = img[y+1][x] - img[y-1][x];
-      int bin = (int)((LEN/2)*atan2f(dy, dx)/3.1416f + (LEN/2) + 0.5f)%LEN;
-      float grad = __fsqrt_rn(dx*dx + dy*dy);
-      atomicAdd(&hist[LEN + bin], grad*gaussx[x]*gaussy[y]);
-    }
-    __syncthreads();
-    // Circular neighbours of bin tx (only meaningful for tx < LEN).
-    int x1m = (tx>=1 ? tx-1 : tx+LEN-1);
-    int x1p = (tx<(LEN-1) ? tx+1 : tx-LEN+1);
-    int x2m = (tx>=2 ? tx-2 : tx+LEN-2);
-    int x2p = (tx<(LEN-2) ? tx+2 : tx-LEN+2);
-    // The three histogram stages below read values written by neighbouring
-    // threads, so each is separated by a barrier instead of relying on
-    // implicit lock-step execution.
-    // Stage 1: smooth the upper half into the lower half.
-    if (tx<LEN)
-      hist[tx] = 6.0f*hist[tx + LEN] + 4.0f*(hist[x1m + LEN] + hist[x1p + LEN]) +
-        1.0f*(hist[x2m + LEN] + hist[x2p + LEN]);
-    __syncthreads();
-    // Stage 2: smooth the lower half back into the upper half.
-    float val = 0.0f;
-    if (tx<LEN) {
-      val = 8.0f*hist[tx] + 4.0f*(hist[x1m] + hist[x1p]) +
-        0.0f*(hist[x2m] + hist[x2p]);
-      hist[tx + LEN] = val;
-    }
-    __syncthreads();
-    // Stage 3: keep only local maxima in the lower half.
-    if (tx<LEN)
-      hist[tx] = (val>hist[x1m + LEN] && val>=hist[x1p + LEN] ? val : 0.0f);
-    __syncthreads();
-    if (tx==0) {
-      // Find the two highest local maxima.
-      float maxval1 = 0.0;
-      float maxval2 = 0.0;
-      int i1 = -1;
-      int i2 = -1;
-      for (int i=0;i<LEN;i++) {
-        float v = hist[i];
-        if (v>maxval1) {
-          maxval2 = maxval1;
-          maxval1 = v;
-          i2 = i1;
-          i1 = i;
-        } else if (v>maxval2) {
-          maxval2 = v;
-          i2 = i;
-        }
-      }
-      // Refine the peak position from its two neighbours; convert bins to degrees.
-      float val1 = hist[LEN + ((i1 + 1)%LEN)];
-      float val2 = hist[LEN + ((i1 + LEN - 1)%LEN)];
-      float peak = i1 + 0.5f*(val1 - val2) / (2.0f*maxval1 - val1 - val2);
-      d_Sift[bx].orientation = 360.0f*(peak<0.0f ? peak + LEN : peak)/LEN;
-      // Make sure the append counter starts no lower than the octave's point count.
-      atomicMax(&d_PointCounter[2*octave+1], d_PointCounter[2*octave+0]);
-      if (maxval2>0.8f*maxval1) {
-        float val1 = hist[LEN + ((i2 + 1)%LEN)];
-        float val2 = hist[LEN + ((i2 + LEN - 1)%LEN)];
-        float peak = i2 + 0.5f*(val1 - val2) / (2.0f*maxval2 - val1 - val2);
-        unsigned int idx = atomicInc(&d_PointCounter[2*octave+1], 0x7fffffff);
-        if (idx<d_MaxNumPoints) {
-          d_Sift[idx].xpos = d_Sift[bx].xpos;
-          d_Sift[idx].ypos = d_Sift[bx].ypos;
-          d_Sift[idx].scale = sc;
-          d_Sift[idx].sharpness = d_Sift[bx].sharpness;
-          d_Sift[idx].edgeness = d_Sift[bx].edgeness;
-          d_Sift[idx].orientation = 360.0f*(peak<0.0f ? peak + LEN : peak)/LEN;
-          d_Sift[idx].subsampling = d_Sift[bx].subsampling;
-        }
-      }
-    }
-    // Thread 0 is still reading hist[] above; hold the other threads back so
-    // the next iteration does not clear it underneath the scan.
-    __syncthreads();
-  }
-#undef RAD
-#undef WID
-#undef LEN
-}
-
-// With constant number of blocks
-__global__ void ComputeOrientationsCONST(cudaTextureObject_t texObj, SiftPoint *d_Sift, int octave)
-{
-  __shared__ float hist[64];
-  __shared__ float gauss[11];
-  const int tx = threadIdx.x;
-  
-  int fstPts = min(d_PointCounter[2*octave-1], d_MaxNumPoints);
-  int totPts = min(d_PointCounter[2*octave+0], d_MaxNumPoints);  
-  for (int bx = blockIdx.x + fstPts; bx < totPts; bx += gridDim.x) {
- 
-    float i2sigma2 = -1.0f/(2.0f*1.5f*1.5f*d_Sift[bx].scale*d_Sift[bx].scale);
-    if (tx<11) 
-      gauss[tx] = exp(i2sigma2*(tx-5)*(tx-5));
-    if (tx<64)
-      hist[tx] = 0.0f;
-    __syncthreads();
-    float xp = d_Sift[bx].xpos - 4.5f;
-    float yp = d_Sift[bx].ypos - 4.5f;
-    int yd = tx/11;
-    int xd = tx - yd*11;
-    float xf = xp + xd;
-    float yf = yp + yd;
-    if (yd<11) {
-      float dx = tex2D<float>(texObj, xf+1.0, yf) - tex2D<float>(texObj, xf-1.0, yf); 
-      float dy = tex2D<float>(texObj, xf, yf+1.0) - tex2D<float>(texObj, xf, yf-1.0); 
-      int bin = 16.0f*atan2f(dy, dx)/3.1416f + 16.5f;
-      if (bin>31)
-	bin = 0;
-      float grad = sqrtf(dx*dx + dy*dy);
-      atomicAdd(&hist[bin], grad*gauss[xd]*gauss[yd]);
-    }
-    __syncthreads();
-    int x1m = (tx>=1 ? tx-1 : tx+31);
-    int x1p = (tx<=30 ? tx+1 : tx-31);
-    if (tx<32) {
-      int x2m = (tx>=2 ? tx-2 : tx+30);
-      int x2p = (tx<=29 ? tx+2 : tx-30);
-      hist[tx+32] = 6.0f*hist[tx] + 4.0f*(hist[x1m] + hist[x1p]) + (hist[x2m] + hist[x2p]);
-    }
-    __syncthreads();
-    if (tx<32) {
-      float v = hist[32+tx];
-      hist[tx] = (v>hist[32+x1m] && v>=hist[32+x1p] ? v : 0.0f);
-    }
-    __syncthreads();
-    if (tx==0) {
-      float maxval1 = 0.0;
-      float maxval2 = 0.0;
-      int i1 = -1;
-      int i2 = -1;
-      for (int i=0;i<32;i++) {
-	float v = hist[i];
-	if (v>maxval1) {
-	  maxval2 = maxval1;
-	  maxval1 = v;
-	  i2 = i1;
-	  i1 = i;
-	} else if (v>maxval2) {
-	  maxval2 = v;
-	  i2 = i;
-	}
-      }
-      float val1 = hist[32+((i1+1)&31)];
-      float val2 = hist[32+((i1+31)&31)];
-      float peak = i1 + 0.5f*(val1-val2) / (2.0f*maxval1-val1-val2);
-      d_Sift[bx].orientation = 11.25f*(peak<0.0f ? peak+32.0f : peak);
-      atomicMax(&d_PointCounter[2*octave+1], d_PointCounter[2*octave+0]); 
-      if (maxval2>0.8f*maxval1 && true) {
-	float val1 = hist[32+((i2+1)&31)];
-	float val2 = hist[32+((i2+31)&31)];
-	float peak = i2 + 0.5f*(val1-val2) / (2.0f*maxval2-val1-val2);
-	unsigned int idx = atomicInc(&d_PointCounter[2*octave+1], 0x7fffffff);
-	if (idx<d_MaxNumPoints) {
-	  d_Sift[idx].xpos = d_Sift[bx].xpos;
-	  d_Sift[idx].ypos = d_Sift[bx].ypos;
-	  d_Sift[idx].scale = d_Sift[bx].scale;
-	  d_Sift[idx].sharpness = d_Sift[bx].sharpness;
-	  d_Sift[idx].edgeness = d_Sift[bx].edgeness;
-	  d_Sift[idx].orientation = 11.25f*(peak<0.0f ? peak+32.0f : peak);;
-	  d_Sift[idx].subsampling = d_Sift[bx].subsampling;
-	}
-      }
-    }
-    __syncthreads();
-  }
-} 
-
-// With constant number of blocks
-__global__ void OrientAndExtractCONST(cudaTextureObject_t texObj, SiftPoint *d_Sift, float subsampling, int octave)
-{
-  __shared__ float hist[64];
-  __shared__ float gauss[11];
-  __shared__ unsigned int idx; //%%%%
-  const int tx = threadIdx.x;
-  
-  int fstPts = min(d_PointCounter[2*octave-1], d_MaxNumPoints);
-  int totPts = min(d_PointCounter[2*octave+0], d_MaxNumPoints);  
-  for (int bx = blockIdx.x + fstPts; bx < totPts; bx += gridDim.x) {
- 
-    float i2sigma2 = -1.0f/(4.5f*d_Sift[bx].scale*d_Sift[bx].scale);
-    if (tx<11) 
-      gauss[tx] = exp(i2sigma2*(tx-5)*(tx-5));
-    if (tx<64)
-      hist[tx] = 0.0f;
-    __syncthreads();
-    float xp = d_Sift[bx].xpos - 4.5f;
-    float yp = d_Sift[bx].ypos - 4.5f;
-    int yd = tx/11;
-    int xd = tx - yd*11;
-    float xf = xp + xd;
-    float yf = yp + yd;
-    if (yd<11) {
-      float dx = tex2D<float>(texObj, xf+1.0, yf) - tex2D<float>(texObj, xf-1.0, yf); 
-      float dy = tex2D<float>(texObj, xf, yf+1.0) - tex2D<float>(texObj, xf, yf-1.0); 
-      int bin = 16.0f*atan2f(dy, dx)/3.1416f + 16.5f;
-      if (bin>31)
-	bin = 0;
-      float grad = sqrtf(dx*dx + dy*dy);
-      atomicAdd(&hist[bin], grad*gauss[xd]*gauss[yd]);
-    }
-    __syncthreads();
-    int x1m = (tx>=1 ? tx-1 : tx+31);
-    int x1p = (tx<=30 ? tx+1 : tx-31);
-    if (tx<32) {
-      int x2m = (tx>=2 ? tx-2 : tx+30);
-      int x2p = (tx<=29 ? tx+2 : tx-30);
-      hist[tx+32] = 6.0f*hist[tx] + 4.0f*(hist[x1m] + hist[x1p]) + (hist[x2m] + hist[x2p]);
-    }
-    __syncthreads();
-    if (tx<32) {
-      float v = hist[32+tx];
-      hist[tx] = (v>hist[32+x1m] && v>=hist[32+x1p] ? v : 0.0f);
-    }
-    __syncthreads();
-    if (tx==0) {
-      float maxval1 = 0.0;
-      float maxval2 = 0.0;
-      int i1 = -1;
-      int i2 = -1;
-      for (int i=0;i<32;i++) {
-	float v = hist[i];
-	if (v>maxval1) {
-	  maxval2 = maxval1;
-	  maxval1 = v;
-	  i2 = i1;
-	  i1 = i;
-	} else if (v>maxval2) {
-	  maxval2 = v;
-	  i2 = i;
-	}
-      }
-      float val1 = hist[32+((i1+1)&31)];
-      float val2 = hist[32+((i1+31)&31)];
-      float peak = i1 + 0.5f*(val1-val2) / (2.0f*maxval1-val1-val2);
-      d_Sift[bx].orientation = 11.25f*(peak<0.0f ? peak+32.0f : peak);
-      idx = 0xffffffff; //%%%%
-      atomicMax(&d_PointCounter[2*octave+1], d_PointCounter[2*octave+0]); 
-      if (maxval2>0.8f*maxval1) {
-	float val1 = hist[32+((i2+1)&31)];
-	float val2 = hist[32+((i2+31)&31)];
-	float peak = i2 + 0.5f*(val1-val2) / (2.0f*maxval2-val1-val2);
-	idx = atomicInc(&d_PointCounter[2*octave+1], 0x7fffffff); //%%%%
-	if (idx<d_MaxNumPoints) {
-	  d_Sift[idx].xpos = d_Sift[bx].xpos;
-	  d_Sift[idx].ypos = d_Sift[bx].ypos;
-	  d_Sift[idx].scale = d_Sift[bx].scale;
-	  d_Sift[idx].sharpness = d_Sift[bx].sharpness;
-	  d_Sift[idx].edgeness = d_Sift[bx].edgeness;
-	  d_Sift[idx].orientation = 11.25f*(peak<0.0f ? peak+32.0f : peak);;
-	  d_Sift[idx].subsampling = d_Sift[bx].subsampling;
-	}
-      }
-    }
-    __syncthreads();
-    ExtractSiftDescriptor(texObj, d_Sift, subsampling, octave, bx); //%%%%
-    if (idx<d_MaxNumPoints) //%%%%
-      ExtractSiftDescriptor(texObj, d_Sift, subsampling, octave, idx); //%%%%
-  }
-} 
-
-
-///////////////////////////////////////////////////////////////////////////////
-// Subtract two images (multi-scale version)
-///////////////////////////////////////////////////////////////////////////////
-  
-__global__ void FindPointsMultiTest(float *d_Data0, SiftPoint *d_Sift, int width, int pitch, int height, float subsampling, float lowestScale, float thresh, float factor, float edgeLimit, int octave)
-{
-  #define MEMWID (MINMAX_W + 2)
-  __shared__ unsigned int cnt;
-  __shared__ unsigned short points[3*MEMWID];
-
-  if (blockIdx.x==0 && blockIdx.y==0 && threadIdx.x==0 && threadIdx.y==0) {
-    atomicMax(&d_PointCounter[2*octave+0], d_PointCounter[2*octave-1]); 
-    atomicMax(&d_PointCounter[2*octave+1], d_PointCounter[2*octave-1]);
-  }
-  int tx = threadIdx.x;
-  int ty = threadIdx.y;
-  if (tx==0 && ty==0)
-    cnt = 0; 
-  __syncthreads();
-
-  int ypos = MINMAX_H*blockIdx.y + ty;
-  if (ypos>=height)
-    return;
-  int block = blockIdx.x/NUM_SCALES; 
-  int scale = blockIdx.x - NUM_SCALES*block;
-  int minx = block*MINMAX_W;
-  int maxx = min(minx + MINMAX_W, width);
-  int xpos = minx + tx;
-  int size = pitch*height;
-  int ptr = size*scale + max(min(xpos-1, width-1), 0);
-
-  float maxv = fabs(d_Data0[ptr + ypos*pitch + 1*size]);
-  maxv = fmaxf(maxv, ShiftDown(maxv, 16, MINMAX_W));
-  maxv = fmaxf(maxv, ShiftDown(maxv, 8, MINMAX_W));
-  maxv = fmaxf(maxv, ShiftDown(maxv, 4, MINMAX_W));
-  maxv = fmaxf(maxv, ShiftDown(maxv, 2, MINMAX_W));
-  maxv = fmaxf(maxv, ShiftDown(maxv, 1, MINMAX_W));
-    
-  if (Shuffle(maxv, 0)>thresh) {
-    int yptr1 = ptr + ypos*pitch;
-    int yptr0 = ptr + max(0,ypos-1)*pitch;
-    int yptr2 = ptr + min(height-1,ypos+1)*pitch;
-    float d20 = d_Data0[yptr0 + 1*size];
-    float d21 = d_Data0[yptr1 + 1*size];
-    float d22 = d_Data0[yptr2 + 1*size];
-    float d31 = d_Data0[yptr1 + 2*size];
-    float d11 = d_Data0[yptr1];
-    
-    float d10 = d_Data0[yptr0];
-    float d12 = d_Data0[yptr2];
-    float ymin1 = fminf(fminf(d10, d11), d12);
-    float ymax1 = fmaxf(fmaxf(d10, d11), d12);
-    float d30 = d_Data0[yptr0 + 2*size];
-    float d32 = d_Data0[yptr2 + 2*size]; 
-    float ymin3 = fminf(fminf(d30, d31), d32);
-    float ymax3 = fmaxf(fmaxf(d30, d31), d32);
-    float ymin2 = fminf(fminf(ymin1, fminf(fminf(d20, d22), d21)), ymin3);
-    float ymax2 = fmaxf(fmaxf(ymax1, fmaxf(fmaxf(d20, d22), d21)), ymax3);
-    
-    float nmin2 = fminf(ShiftUp(ymin2, 1), ShiftDown(ymin2, 1));
-    float nmax2 = fmaxf(ShiftUp(ymax2, 1), ShiftDown(ymax2, 1));
-    if (tx>0 && tx<MINMAX_W+1 && xpos<=maxx) {
-      if (d21<-thresh) {
-	float minv = fminf(fminf(nmin2, ymin1), ymin3);
-	minv = fminf(fminf(minv, d20), d22);
-	if (d21<minv) { 
-	  int pos = atomicInc(&cnt, MEMWID-1);
-	  points[3*pos+0] = xpos - 1;
-	  points[3*pos+1] = ypos;
-	  points[3*pos+2] = scale;
-	}
-      } 
-      if (d21>thresh) {
-	float maxv = fmaxf(fmaxf(nmax2, ymax1), ymax3);
-	maxv = fmaxf(fmaxf(maxv, d20), d22);
-	if (d21>maxv) { 
-	  int pos = atomicInc(&cnt, MEMWID-1);
-	  points[3*pos+0] = xpos - 1;
-	  points[3*pos+1] = ypos;
-	  points[3*pos+2] = scale;
-	}
-      }
-    }
-  }
-  __syncthreads();
-  if (ty==0 && tx<cnt) {
-    int xpos = points[3*tx+0];
-    int ypos = points[3*tx+1];
-    int scale = points[3*tx+2];
-    int ptr = xpos + (ypos + (scale+1)*height)*pitch;
-    float val = d_Data0[ptr];
-    float *data1 = &d_Data0[ptr];
-    float dxx = 2.0f*val - data1[-1] - data1[1];
-    float dyy = 2.0f*val - data1[-pitch] - data1[pitch];
-    float dxy = 0.25f*(data1[+pitch+1] + data1[-pitch-1] - data1[-pitch+1] - data1[+pitch-1]);
-    float tra = dxx + dyy;
-    float det = dxx*dyy - dxy*dxy;
-    if (tra*tra<edgeLimit*det) {
-      float edge = __fdividef(tra*tra, det);
-      float dx = 0.5f*(data1[1] - data1[-1]);
-      float dy = 0.5f*(data1[pitch] - data1[-pitch]); 
-      float *data0 = d_Data0 + ptr - height*pitch;
-      float *data2 = d_Data0 + ptr + height*pitch;
-      float ds = 0.5f*(data0[0] - data2[0]); 
-      float dss = 2.0f*val - data2[0] - data0[0];
-      float dxs = 0.25f*(data2[1] + data0[-1] - data0[1] - data2[-1]);
-      float dys = 0.25f*(data2[pitch] + data0[-pitch] - data2[-pitch] - data0[pitch]);
-      float idxx = dyy*dss - dys*dys;
-      float idxy = dys*dxs - dxy*dss;   
-      float idxs = dxy*dys - dyy*dxs;
-      float idet = __fdividef(1.0f, idxx*dxx + idxy*dxy + idxs*dxs);
-      float idyy = dxx*dss - dxs*dxs;
-      float idys = dxy*dxs - dxx*dys;
-      float idss = dxx*dyy - dxy*dxy;
-      float pdx = idet*(idxx*dx + idxy*dy + idxs*ds);
-      float pdy = idet*(idxy*dx + idyy*dy + idys*ds);
-      float pds = idet*(idxs*dx + idys*dy + idss*ds);
-      if (pdx<-0.5f || pdx>0.5f || pdy<-0.5f || pdy>0.5f || pds<-0.5f || pds>0.5f) {
-	pdx = __fdividef(dx, dxx);
-	pdy = __fdividef(dy, dyy);
-	pds = __fdividef(ds, dss);
-      }
-      float dval = 0.5f*(dx*pdx + dy*pdy + ds*pds);
-      int maxPts = d_MaxNumPoints;
-      float sc = powf(2.0f, (float)scale/NUM_SCALES) * exp2f(pds*factor);
-      if (sc>=lowestScale) {
-	unsigned int idx = atomicInc(&d_PointCounter[2*octave+0], 0x7fffffff);
-	idx = (idx>=maxPts ? maxPts-1 : idx);
-	d_Sift[idx].xpos = xpos + pdx;
-	d_Sift[idx].ypos = ypos + pdy;
-	d_Sift[idx].scale = sc;
-	d_Sift[idx].sharpness = val + dval;
-	d_Sift[idx].edgeness = edge;
-	d_Sift[idx].subsampling = subsampling;
-      }
-    }
-  }
-}
-
-__global__ void FindPointsMultiNew(float *d_Data0, SiftPoint *d_Sift, int width, int pitch, int height, float subsampling, float lowestScale, float thresh, float factor, float edgeLimit, int octave)
-{
-  #define MEMWID (MINMAX_W + 2)
-  __shared__ unsigned short points[2*MEMWID];
-  
-  if (blockIdx.x==0 && blockIdx.y==0 && threadIdx.x==0) {
-    atomicMax(&d_PointCounter[2*octave+0], d_PointCounter[2*octave-1]);
-    atomicMax(&d_PointCounter[2*octave+1], d_PointCounter[2*octave-1]);
-  }
-  int tx = threadIdx.x;
-  int block = blockIdx.x/NUM_SCALES; 
-  int scale = blockIdx.x - NUM_SCALES*block;
-  int minx = block*MINMAX_W;
-  int maxx = min(minx + MINMAX_W, width);
-  int xpos = minx + tx;
-  int size = pitch*height;
-  int ptr = size*scale + max(min(xpos-1, width-1), 0);
-
-  int yloops = min(height - MINMAX_H*blockIdx.y, MINMAX_H);
-  float maxv = 0.0f;
-  for (int y=0;y<yloops;y++) {
-    int ypos = MINMAX_H*blockIdx.y + y;
-    int yptr1 = ptr + ypos*pitch;
-    float val = d_Data0[yptr1 + 1*size];
-    maxv = fmaxf(maxv, fabs(val));
-  }
-  //if (tx==0) printf("XXX1\n");
-  if (!__any_sync(0xffffffff, maxv>thresh))
-    return;
-  //if (tx==0) printf("XXX2\n");
-  
-  int ptbits = 0;
-  for (int y=0;y<yloops;y++) {
-
-    int ypos = MINMAX_H*blockIdx.y + y;
-    int yptr1 = ptr + ypos*pitch;
-    float d11 = d_Data0[yptr1 + 1*size];
-    if (__any_sync(0xffffffff, fabs(d11)>thresh)) {
-    
-      int yptr0 = ptr + max(0,ypos-1)*pitch;
-      int yptr2 = ptr + min(height-1,ypos+1)*pitch;
-      float d01 = d_Data0[yptr1];
-      float d10 = d_Data0[yptr0 + 1*size];
-      float d12 = d_Data0[yptr2 + 1*size];
-      float d21 = d_Data0[yptr1 + 2*size];
-      
-      float d00 = d_Data0[yptr0];
-      float d02 = d_Data0[yptr2];
-      float ymin1 = fminf(fminf(d00, d01), d02);
-      float ymax1 = fmaxf(fmaxf(d00, d01), d02);
-      float d20 = d_Data0[yptr0 + 2*size];
-      float d22 = d_Data0[yptr2 + 2*size]; 
-      float ymin3 = fminf(fminf(d20, d21), d22);
-      float ymax3 = fmaxf(fmaxf(d20, d21), d22);
-      float ymin2 = fminf(fminf(ymin1, fminf(fminf(d10, d12), d11)), ymin3);
-      float ymax2 = fmaxf(fmaxf(ymax1, fmaxf(fmaxf(d10, d12), d11)), ymax3);
-      
-      float nmin2 = fminf(ShiftUp(ymin2, 1), ShiftDown(ymin2, 1));
-      float nmax2 = fmaxf(ShiftUp(ymax2, 1), ShiftDown(ymax2, 1));
-      float minv = fminf(fminf(nmin2, ymin1), ymin3);
-      minv = fminf(fminf(minv, d10), d12);
-      float maxv = fmaxf(fmaxf(nmax2, ymax1), ymax3);
-      maxv = fmaxf(fmaxf(maxv, d10), d12);
-      
-      if (tx>0 && tx<MINMAX_W+1 && xpos<=maxx) 
-	ptbits |= ((d11 < fminf(-thresh, minv)) | (d11 > fmaxf(thresh, maxv))) << y;
-    }
-  }
-  
-  unsigned int totbits = __popc(ptbits);
-  unsigned int numbits = totbits;
-  for (int d=1;d<32;d<<=1) {
-    unsigned int num = ShiftUp(totbits, d);
-    if (tx >= d)
-      totbits += num;
-  }
-  int pos = totbits - numbits;
-  for (int y=0;y<yloops;y++) {
-    int ypos = MINMAX_H*blockIdx.y + y;
-    if (ptbits & (1 << y) && pos<MEMWID) {
-      points[2*pos + 0] = xpos - 1;
-      points[2*pos + 1] = ypos;
-      pos ++;
-    }
-  } 
-
-  totbits = Shuffle(totbits, 31);
-  if (tx<totbits) {
-    int xpos = points[2*tx + 0];
-    int ypos = points[2*tx + 1];
-    int ptr = xpos + (ypos + (scale + 1)*height)*pitch;
-    float val = d_Data0[ptr];
-    float *data1 = &d_Data0[ptr];
-    float dxx = 2.0f*val - data1[-1] - data1[1];
-    float dyy = 2.0f*val - data1[-pitch] - data1[pitch];
-    float dxy = 0.25f*(data1[+pitch+1] + data1[-pitch-1] - data1[-pitch+1] - data1[+pitch-1]);
-    float tra = dxx + dyy;
-    float det = dxx*dyy - dxy*dxy;
-    if (tra*tra<edgeLimit*det) {
-      float edge = __fdividef(tra*tra, det);
-      float dx = 0.5f*(data1[1] - data1[-1]);
-      float dy = 0.5f*(data1[pitch] - data1[-pitch]); 
-      float *data0 = d_Data0 + ptr - height*pitch;
-      float *data2 = d_Data0 + ptr + height*pitch;
-      float ds = 0.5f*(data0[0] - data2[0]); 
-      float dss = 2.0f*val - data2[0] - data0[0];
-      float dxs = 0.25f*(data2[1] + data0[-1] - data0[1] - data2[-1]);
-      float dys = 0.25f*(data2[pitch] + data0[-pitch] - data2[-pitch] - data0[pitch]);
-      float idxx = dyy*dss - dys*dys;
-      float idxy = dys*dxs - dxy*dss;   
-      float idxs = dxy*dys - dyy*dxs;
-      float idet = __fdividef(1.0f, idxx*dxx + idxy*dxy + idxs*dxs);
-      float idyy = dxx*dss - dxs*dxs;
-      float idys = dxy*dxs - dxx*dys;
-      float idss = dxx*dyy - dxy*dxy;
-      float pdx = idet*(idxx*dx + idxy*dy + idxs*ds);
-      float pdy = idet*(idxy*dx + idyy*dy + idys*ds);
-      float pds = idet*(idxs*dx + idys*dy + idss*ds);
-      if (pdx<-0.5f || pdx>0.5f || pdy<-0.5f || pdy>0.5f || pds<-0.5f || pds>0.5f) {
-	pdx = __fdividef(dx, dxx);
-	pdy = __fdividef(dy, dyy);
-	pds = __fdividef(ds, dss);
-      }
-      float dval = 0.5f*(dx*pdx + dy*pdy + ds*pds);
-      int maxPts = d_MaxNumPoints;
-      float sc = powf(2.0f, (float)scale/NUM_SCALES) * exp2f(pds*factor);
-      if (sc>=lowestScale) {
-	atomicMax(&d_PointCounter[2*octave+0], d_PointCounter[2*octave-1]); 
-	unsigned int idx = atomicInc(&d_PointCounter[2*octave+0], 0x7fffffff);
-	idx = (idx>=maxPts ? maxPts-1 : idx);
-	d_Sift[idx].xpos = xpos + pdx;
-	d_Sift[idx].ypos = ypos + pdy;
-	d_Sift[idx].scale = sc;
-	d_Sift[idx].sharpness = val + dval;
-	d_Sift[idx].edgeness = edge;
-	d_Sift[idx].subsampling = subsampling;
-      }
-    }
-  }
-}
-
-__global__ void FindPointsMulti(float *d_Data0, SiftPoint *d_Sift, int width, int pitch, int height, float subsampling, float lowestScale, float thresh, float factor, float edgeLimit, int octave)
-{
-  #define MEMWID (MINMAX_W + 2)
-  __shared__ unsigned int cnt;
-  __shared__ unsigned short points[3*MEMWID];
-
-  if (blockIdx.x==0 && blockIdx.y==0 && threadIdx.x==0) {
-    atomicMax(&d_PointCounter[2*octave+0], d_PointCounter[2*octave-1]);
-    atomicMax(&d_PointCounter[2*octave+1], d_PointCounter[2*octave-1]);
-  }
-  int tx = threadIdx.x;
-  int block = blockIdx.x/NUM_SCALES; 
-  int scale = blockIdx.x - NUM_SCALES*block;
-  int minx = block*MINMAX_W;
-  int maxx = min(minx + MINMAX_W, width);
-  int xpos = minx + tx;
-  int size = pitch*height;
-  int ptr = size*scale + max(min(xpos-1, width-1), 0);
-
-  int yloops = min(height - MINMAX_H*blockIdx.y, MINMAX_H);
-  float maxv = 0.0f;
-  for (int y=0;y<yloops;y++) {
-    int ypos = MINMAX_H*blockIdx.y + y;
-    int yptr1 = ptr + ypos*pitch;
-    float val = d_Data0[yptr1 + 1*size];
-    maxv = fmaxf(maxv, fabs(val));
-  }
-  maxv = fmaxf(maxv, ShiftDown(maxv, 16, MINMAX_W));
-  maxv = fmaxf(maxv, ShiftDown(maxv, 8, MINMAX_W));
-  maxv = fmaxf(maxv, ShiftDown(maxv, 4, MINMAX_W));
-  maxv = fmaxf(maxv, ShiftDown(maxv, 2, MINMAX_W));
-  maxv = fmaxf(maxv, ShiftDown(maxv, 1, MINMAX_W));
-  if (Shuffle(maxv, 0)<=thresh)
-    return;
-  
-  if (tx==0)
-    cnt = 0; 
-  __syncthreads();
-
-  for (int y=0;y<yloops;y++) {
-
-    int ypos = MINMAX_H*blockIdx.y + y;
-    int yptr1 = ptr + ypos*pitch;
-    int yptr0 = ptr + max(0,ypos-1)*pitch;
-    int yptr2 = ptr + min(height-1,ypos+1)*pitch;
-    float d20 = d_Data0[yptr0 + 1*size];
-    float d21 = d_Data0[yptr1 + 1*size];
-    float d22 = d_Data0[yptr2 + 1*size];
-    float d31 = d_Data0[yptr1 + 2*size];
-    float d11 = d_Data0[yptr1];
-    
-    float d10 = d_Data0[yptr0];
-    float d12 = d_Data0[yptr2];
-    float ymin1 = fminf(fminf(d10, d11), d12);
-    float ymax1 = fmaxf(fmaxf(d10, d11), d12);
-    float d30 = d_Data0[yptr0 + 2*size];
-    float d32 = d_Data0[yptr2 + 2*size]; 
-    float ymin3 = fminf(fminf(d30, d31), d32);
-    float ymax3 = fmaxf(fmaxf(d30, d31), d32);
-    float ymin2 = fminf(fminf(ymin1, fminf(fminf(d20, d22), d21)), ymin3);
-    float ymax2 = fmaxf(fmaxf(ymax1, fmaxf(fmaxf(d20, d22), d21)), ymax3);
-    
-    float nmin2 = fminf(ShiftUp(ymin2, 1), ShiftDown(ymin2, 1));
-    float nmax2 = fmaxf(ShiftUp(ymax2, 1), ShiftDown(ymax2, 1));
-    if (tx>0 && tx<MINMAX_W+1 && xpos<=maxx) {
-      if (d21<-thresh) {
-	float minv = fminf(fminf(nmin2, ymin1), ymin3);
-	minv = fminf(fminf(minv, d20), d22);
-	if (d21<minv) { 
-	  int pos = atomicInc(&cnt, MEMWID-1);
-	  points[3*pos+0] = xpos - 1;
-	  points[3*pos+1] = ypos;
-	  points[3*pos+2] = scale;
-	}
-      } 
-      if (d21>thresh) {
-	float maxv = fmaxf(fmaxf(nmax2, ymax1), ymax3);
-	maxv = fmaxf(fmaxf(maxv, d20), d22);
-	if (d21>maxv) { 
-	  int pos = atomicInc(&cnt, MEMWID-1);
-	  points[3*pos+0] = xpos - 1;
-	  points[3*pos+1] = ypos;
-	  points[3*pos+2] = scale;
-	}
-      }
-    }
-  }
-  if (tx<cnt) {
-    int xpos = points[3*tx+0];
-    int ypos = points[3*tx+1];
-    int scale = points[3*tx+2];
-    int ptr = xpos + (ypos + (scale+1)*height)*pitch;
-    float val = d_Data0[ptr];
-    float *data1 = &d_Data0[ptr];
-    float dxx = 2.0f*val - data1[-1] - data1[1];
-    float dyy = 2.0f*val - data1[-pitch] - data1[pitch];
-    float dxy = 0.25f*(data1[+pitch+1] + data1[-pitch-1] - data1[-pitch+1] - data1[+pitch-1]);
-    float tra = dxx + dyy;
-    float det = dxx*dyy - dxy*dxy;
-    if (tra*tra<edgeLimit*det) {
-      float edge = __fdividef(tra*tra, det);
-      float dx = 0.5f*(data1[1] - data1[-1]);
-      float dy = 0.5f*(data1[pitch] - data1[-pitch]); 
-      float *data0 = d_Data0 + ptr - height*pitch;
-      float *data2 = d_Data0 + ptr + height*pitch;
-      float ds = 0.5f*(data0[0] - data2[0]); 
-      float dss = 2.0f*val - data2[0] - data0[0];
-      float dxs = 0.25f*(data2[1] + data0[-1] - data0[1] - data2[-1]);
-      float dys = 0.25f*(data2[pitch] + data0[-pitch] - data2[-pitch] - data0[pitch]);
-      float idxx = dyy*dss - dys*dys;
-      float idxy = dys*dxs - dxy*dss;   
-      float idxs = dxy*dys - dyy*dxs;
-      float idet = __fdividef(1.0f, idxx*dxx + idxy*dxy + idxs*dxs);
-      float idyy = dxx*dss - dxs*dxs;
-      float idys = dxy*dxs - dxx*dys;
-      float idss = dxx*dyy - dxy*dxy;
-      float pdx = idet*(idxx*dx + idxy*dy + idxs*ds);
-      float pdy = idet*(idxy*dx + idyy*dy + idys*ds);
-      float pds = idet*(idxs*dx + idys*dy + idss*ds);
-      if (pdx<-0.5f || pdx>0.5f || pdy<-0.5f || pdy>0.5f || pds<-0.5f || pds>0.5f) {
-	pdx = __fdividef(dx, dxx);
-	pdy = __fdividef(dy, dyy);
-	pds = __fdividef(ds, dss);
-      }
-      float dval = 0.5f*(dx*pdx + dy*pdy + ds*pds);
-      int maxPts = d_MaxNumPoints;
-      float sc = powf(2.0f, (float)scale/NUM_SCALES) * exp2f(pds*factor);
-      if (sc>=lowestScale) {
-	atomicMax(&d_PointCounter[2*octave+0], d_PointCounter[2*octave-1]); 
-	unsigned int idx = atomicInc(&d_PointCounter[2*octave+0], 0x7fffffff);
-	idx = (idx>=maxPts ? maxPts-1 : idx);
-	d_Sift[idx].xpos = xpos + pdx;
-	d_Sift[idx].ypos = ypos + pdy;
-	d_Sift[idx].scale = sc;
-	d_Sift[idx].sharpness = val + dval;
-	d_Sift[idx].edgeness = edge;
-	d_Sift[idx].subsampling = subsampling;
-      }
-    }
-  }
-}
-
-
-__global__ void FindPointsMultiOld(float *d_Data0, SiftPoint *d_Sift, int width, int pitch, int height, float subsampling, float lowestScale, float thresh, float factor, float edgeLimit, int octave)
-{
-  #define MEMWID (MINMAX_W + 2)
-  __shared__ float ymin1[MEMWID], ymin2[MEMWID], ymin3[MEMWID];
-  __shared__ float ymax1[MEMWID], ymax2[MEMWID], ymax3[MEMWID];
-  __shared__ unsigned int cnt;
-  __shared__ unsigned short points[3*MEMWID];
-
-  if (blockIdx.x==0 && blockIdx.y==0 && threadIdx.x==0) {
-    atomicMax(&d_PointCounter[2*octave+0], d_PointCounter[2*octave-1]); 
-    atomicMax(&d_PointCounter[2*octave+1], d_PointCounter[2*octave-1]);
-  }
-  int tx = threadIdx.x;
-  int block = blockIdx.x/NUM_SCALES; 
-  int scale = blockIdx.x - NUM_SCALES*block;
-  int minx = block*MINMAX_W;
-  int maxx = min(minx + MINMAX_W, width);
-  int xpos = minx + tx;
-  int size = pitch*height;
-  int ptr = size*scale + max(min(xpos-1, width-1), 0);
-
-  int yloops = min(height - MINMAX_H*blockIdx.y, MINMAX_H);
-  float maxv = 0.0f;
-  for (int y=0;y<yloops;y++) {
-    int ypos = MINMAX_H*blockIdx.y + y;
-    int yptr1 = ptr + ypos*pitch;
-    float val = d_Data0[yptr1 + 1*size];
-    maxv = fmaxf(maxv, fabs(val));
-  }
-  maxv = fmaxf(maxv, ShiftDown(maxv, 16, MINMAX_W));
-  maxv = fmaxf(maxv, ShiftDown(maxv, 8, MINMAX_W));
-  maxv = fmaxf(maxv, ShiftDown(maxv, 4, MINMAX_W));
-  maxv = fmaxf(maxv, ShiftDown(maxv, 2, MINMAX_W));
-  maxv = fmaxf(maxv, ShiftDown(maxv, 1, MINMAX_W));
-  if (Shuffle(maxv, 0)<=thresh)
-    return;
-  
-  if (tx==0)
-    cnt = 0; 
-  __syncthreads();
-
-  for (int y=0;y<yloops;y++) {
-
-    int ypos = MINMAX_H*blockIdx.y + y;
-    int yptr1 = ptr + ypos*pitch;
-    int yptr0 = ptr + max(0,ypos-1)*pitch;
-    int yptr2 = ptr + min(height-1,ypos+1)*pitch;
-    float d20 = d_Data0[yptr0 + 1*size];
-    float d21 = d_Data0[yptr1 + 1*size];
-    float d22 = d_Data0[yptr2 + 1*size];
-    float d31 = d_Data0[yptr1 + 2*size];
-    float d11 = d_Data0[yptr1];
-
-    float d10 = d_Data0[yptr0];
-    float d12 = d_Data0[yptr2];
-    ymin1[tx] = fminf(fminf(d10, d11), d12);
-    ymax1[tx] = fmaxf(fmaxf(d10, d11), d12);
-    float d30 = d_Data0[yptr0 + 2*size];
-    float d32 = d_Data0[yptr2 + 2*size]; 
-    ymin3[tx] = fminf(fminf(d30, d31), d32);
-    ymax3[tx] = fmaxf(fmaxf(d30, d31), d32);
-    ymin2[tx] = fminf(fminf(ymin1[tx], fminf(fminf(d20, d22), d21)), ymin3[tx]);
-    ymax2[tx] = fmaxf(fmaxf(ymax1[tx], fmaxf(fmaxf(d20, d22), d21)), ymax3[tx]);
-    
-    __syncthreads(); 
-
-    if (tx>0 && tx<MINMAX_W+1 && xpos<=maxx) {
-      if (d21<-thresh) {
-	float minv = fminf(fminf(fminf(ymin2[tx-1], ymin2[tx+1]), ymin1[tx]), ymin3[tx]);
-	minv = fminf(fminf(minv, d20), d22);
-	if (d21<minv) { 
-	  int pos = atomicInc(&cnt, MEMWID-1);
-	  points[3*pos+0] = xpos - 1;
-	  points[3*pos+1] = ypos;
-	  points[3*pos+2] = scale;
-	}
-      } 
-      if (d21>thresh) {
-	float maxv = fmaxf(fmaxf(fmaxf(ymax2[tx-1], ymax2[tx+1]), ymax1[tx]), ymax3[tx]);
-	maxv = fmaxf(fmaxf(maxv, d20), d22);
-	if (d21>maxv) { 
-	  int pos = atomicInc(&cnt, MEMWID-1);
-	  points[3*pos+0] = xpos - 1;
-	  points[3*pos+1] = ypos;
-	  points[3*pos+2] = scale;
-	}
-      }
-    }
-    __syncthreads();
-  }
-  if (tx<cnt) {
-    int xpos = points[3*tx+0];
-    int ypos = points[3*tx+1];
-    int scale = points[3*tx+2];
-    int ptr = xpos + (ypos + (scale+1)*height)*pitch;
-    float val = d_Data0[ptr];
-    float *data1 = &d_Data0[ptr];
-    float dxx = 2.0f*val - data1[-1] - data1[1];
-    float dyy = 2.0f*val - data1[-pitch] - data1[pitch];
-    float dxy = 0.25f*(data1[+pitch+1] + data1[-pitch-1] - data1[-pitch+1] - data1[+pitch-1]);
-    float tra = dxx + dyy;
-    float det = dxx*dyy - dxy*dxy;
-    if (tra*tra<edgeLimit*det) {
-      float edge = __fdividef(tra*tra, det);
-      float dx = 0.5f*(data1[1] - data1[-1]);
-      float dy = 0.5f*(data1[pitch] - data1[-pitch]); 
-      float *data0 = d_Data0 + ptr - height*pitch;
-      float *data2 = d_Data0 + ptr + height*pitch;
-      float ds = 0.5f*(data0[0] - data2[0]); 
-      float dss = 2.0f*val - data2[0] - data0[0];
-      float dxs = 0.25f*(data2[1] + data0[-1] - data0[1] - data2[-1]);
-      float dys = 0.25f*(data2[pitch] + data0[-pitch] - data2[-pitch] - data0[pitch]);
-      float idxx = dyy*dss - dys*dys;
-      float idxy = dys*dxs - dxy*dss;   
-      float idxs = dxy*dys - dyy*dxs;
-      float idet = __fdividef(1.0f, idxx*dxx + idxy*dxy + idxs*dxs);
-      float idyy = dxx*dss - dxs*dxs;
-      float idys = dxy*dxs - dxx*dys;
-      float idss = dxx*dyy - dxy*dxy;
-      float pdx = idet*(idxx*dx + idxy*dy + idxs*ds);
-      float pdy = idet*(idxy*dx + idyy*dy + idys*ds);
-      float pds = idet*(idxs*dx + idys*dy + idss*ds);
-      if (pdx<-0.5f || pdx>0.5f || pdy<-0.5f || pdy>0.5f || pds<-0.5f || pds>0.5f) {
-	pdx = __fdividef(dx, dxx);
-	pdy = __fdividef(dy, dyy);
-	pds = __fdividef(ds, dss);
-      }
-      float dval = 0.5f*(dx*pdx + dy*pdy + ds*pds);
-      int maxPts = d_MaxNumPoints;
-      float sc = powf(2.0f, (float)scale/NUM_SCALES) * exp2f(pds*factor);
-      if (sc>=lowestScale) {
-	unsigned int idx = atomicInc(&d_PointCounter[2*octave+0], 0x7fffffff);
-	idx = (idx>=maxPts ? maxPts-1 : idx);
-	d_Sift[idx].xpos = xpos + pdx;
-	d_Sift[idx].ypos = ypos + pdy;
-	d_Sift[idx].scale = sc;
-	d_Sift[idx].sharpness = val + dval;
-	d_Sift[idx].edgeness = edge;
-	d_Sift[idx].subsampling = subsampling;
-      }
-    }
-  }
-}
-
-
-__global__ void LaplaceMultiTex(cudaTextureObject_t texObj, float *d_Result, int width, int pitch, int height, int octave)
-{
-  __shared__ float data1[(LAPLACE_W + 2*LAPLACE_R)*LAPLACE_S];
-  __shared__ float data2[LAPLACE_W*LAPLACE_S];
-  const int tx = threadIdx.x;
-  const int xp = blockIdx.x*LAPLACE_W + tx;
-  const int yp = blockIdx.y;
-  const int scale = threadIdx.y;
-  float *kernel = d_LaplaceKernel + octave*12*16 + scale*16;
-  float *sdata1 = data1 + (LAPLACE_W + 2*LAPLACE_R)*scale; 
-  float x = xp-3.5;
-  float y = yp+0.5;
-  sdata1[tx] = kernel[0]*tex2D<float>(texObj, x, y) + 
-    kernel[1]*(tex2D<float>(texObj, x, y-1.0) + tex2D<float>(texObj, x, y+1.0)) + 
-    kernel[2]*(tex2D<float>(texObj, x, y-2.0) + tex2D<float>(texObj, x, y+2.0)) + 
-    kernel[3]*(tex2D<float>(texObj, x, y-3.0) + tex2D<float>(texObj, x, y+3.0)) + 
-    kernel[4]*(tex2D<float>(texObj, x, y-4.0) + tex2D<float>(texObj, x, y+4.0));
-  __syncthreads();
-  float *sdata2 = data2 + LAPLACE_W*scale; 
-  if (tx<LAPLACE_W) {
-    sdata2[tx] = kernel[0]*sdata1[tx+4] + 
-      kernel[1]*(sdata1[tx+3] + sdata1[tx+5]) + 
-      kernel[2]*(sdata1[tx+2] + sdata1[tx+6]) + 
-      kernel[3]*(sdata1[tx+1] + sdata1[tx+7]) + 
-      kernel[4]*(sdata1[tx+0] + sdata1[tx+8]);
-  }
-  __syncthreads(); 
-  if (tx<LAPLACE_W && scale<LAPLACE_S-1 && xp<width) 
-    d_Result[scale*height*pitch + yp*pitch + xp] = sdata2[tx] - sdata2[tx+LAPLACE_W];
-}
-
-
-// For one image row (blockIdx.y) and a LAPLACE_W-wide column tile (blockIdx.x), filters the
-// input with each of the LAPLACE_S symmetric kernels of the given octave, first vertically and
-// then horizontally, and writes the difference between consecutive scales to d_Result.
-//   d_Image  : input image, addressed as row*pitch + column
-//   d_Result : output planes; plane s starts at s*height*pitch and holds filtered(s+1) - filtered(s)
-//   width, pitch, height : image extent and row stride, all in elements
-//   octave   : selects the kernel table at d_LaplaceKernel + octave*12*16 (16 floats per scale)
-// LaplaceMulti() in cudaSiftH.cu launches this with LAPLACE_W + 2*LAPLACE_R threads per block
-// and a grid of (iDivUp(width, LAPLACE_W), height).
-__global__ void LaplaceMultiMem(float *d_Image, float *d_Result, int width, int pitch, int height, int octave)
-{
-  __shared__ float buff[(LAPLACE_W + 2*LAPLACE_R)*LAPLACE_S];
-  const int tx = threadIdx.x;
-  const int xp = blockIdx.x*LAPLACE_W + tx;
-  const int yp = blockIdx.y;
-  // Column xp - LAPLACE_R clamped to [0, width-1]: thread tx supplies the left-shifted halo column.
-  float *data = d_Image + max(min(xp - LAPLACE_R, width-1), 0);
-  float temp[2*LAPLACE_R + 1], kern[LAPLACE_S][LAPLACE_R + 1];
-  if (xp<(width + 2*LAPLACE_R)) {
-    // Rows yp-LAPLACE_R .. yp+LAPLACE_R of this column, clamped to [0, height-1].
-    for (int i=0;i<=2*LAPLACE_R;i++)
-      temp[i] = data[max(0, min(yp + i - LAPLACE_R, height - 1))*pitch];
-    // Vertical pass: one filtered value per scale, stored in that scale's slice of buff.
-    for (int scale=0;scale<LAPLACE_S;scale++) {
-      float *buf = buff + (LAPLACE_W + 2*LAPLACE_R)*scale; 
-      float *kernel = d_LaplaceKernel + octave*12*16 + scale*16; 
-      // kern keeps a per-thread copy of the taps (index 0 = centre) for reuse in the horizontal pass.
-      for (int i=0;i<=LAPLACE_R;i++)
-	kern[scale][i] = kernel[i];
-      float sum = kern[scale][0]*temp[LAPLACE_R];
-#pragma unroll      
-      for (int j=1;j<=LAPLACE_R;j++)
-	sum += kern[scale][j]*(temp[LAPLACE_R - j] + temp[LAPLACE_R + j]);
-      buf[tx] = sum;
-    }
-  }
-  // All vertical results must be in shared memory before neighbours are read.
-  __syncthreads();
-  // Horizontal pass: only the first LAPLACE_W threads that map to a real column produce output.
-  if (tx<LAPLACE_W && xp<width) {
-    // Scale 0 only seeds oldRes; nothing is written for it.
-    int scale = 0;
-    float oldRes = kern[scale][0]*buff[tx + LAPLACE_R];
-#pragma unroll
-    for (int j=1;j<=LAPLACE_R;j++)
-      oldRes += kern[scale][j]*(buff[tx + LAPLACE_R - j] + buff[tx + LAPLACE_R + j]); 
-    for (int scale=1;scale<LAPLACE_S;scale++) {
-      float *buf = buff + (LAPLACE_W + 2*LAPLACE_R)*scale; 
-      float res = kern[scale][0]*buf[tx + LAPLACE_R];
-#pragma unroll
-      for (int j=1;j<=LAPLACE_R;j++)
-	res += kern[scale][j]*(buf[tx + LAPLACE_R - j] + buf[tx + LAPLACE_R + j]); 
-      // Output plane (scale-1) receives the difference to the previous scale.
-      d_Result[(scale-1)*height*pitch + yp*pitch + xp] = res - oldRes;
-      oldRes = res;
-    }
-  }
-}
-
-// Variant of the multi-scale vertical+horizontal filter in which each of the first LAPLACE_W/4
-// threads produces four adjacent output columns through float4 loads from shared memory and a
-// float4 store to d_Result. Parameters have the same meaning as in LaplaceMultiMem.
-// The code hard-codes nine taps (temp[9], offsets 0..8), so it presumably requires
-// LAPLACE_R == 4 -- TODO confirm. No launch of this kernel is visible in LaplaceMulti().
-// NOTE(review): the float4 casts look like they need (LAPLACE_W + 2*LAPLACE_R), pitch and
-// blockIdx.x*LAPLACE_W to be multiples of 4 and 16-byte aligned base pointers -- confirm.
-__global__ void LaplaceMultiMemWide(float *d_Image, float *d_Result, int width, int pitch, int height, int octave)
-{
-  __shared__ float buff[(LAPLACE_W + 2*LAPLACE_R)*LAPLACE_S];
-  const int tx = threadIdx.x;
-  const int xp = blockIdx.x*LAPLACE_W + tx;
-  // First of the four output columns handled by this thread in the horizontal pass.
-  const int xp4 = blockIdx.x*LAPLACE_W + 4*tx;
-  const int yp = blockIdx.y;
-  float kern[LAPLACE_S][LAPLACE_R+1];
-  // Column xp - 4 clamped to [0, width-1].
-  float *data = d_Image + max(min(xp - 4, width-1), 0);
-  float temp[9]; 
-  if (xp<(width + 2*LAPLACE_R)) {
-    // Rows above yp can fall below 0, so they are clamped on both sides ...
-    for (int i=0;i<4;i++)
-      temp[i] = data[max(0, min(yp+i-4, height-1))*pitch];
-    // ... rows yp and below only need the upper clamp.
-    for (int i=4;i<8+1;i++)
-      temp[i] = data[min(yp+i-4, height-1)*pitch];
-    for (int scale=0;scale<LAPLACE_S;scale++) {
-      float *kernel = d_LaplaceKernel + octave*12*16 + scale*16; 
-      // Taps are stored reversed: kern[scale][LAPLACE_R] is kernel[0], kern[scale][0] is kernel[LAPLACE_R].
-      for (int i=0;i<=LAPLACE_R;i++)
-	kern[scale][i] = kernel[LAPLACE_R - i];
-      float *buf = buff + (LAPLACE_W + 2*LAPLACE_R)*scale; 
-      // Vertical pass for this scale.
-      buf[tx] = kern[scale][4]*temp[4] +
-	kern[scale][3]*(temp[3] + temp[5]) + kern[scale][2]*(temp[2] + temp[6]) + 
-	kern[scale][1]*(temp[1] + temp[7]) + kern[scale][0]*(temp[0] + temp[8]);
-    }
-  }
-  __syncthreads();
-  if (tx<LAPLACE_W/4 && xp4<width) {
-    // b0, b1, b2 cover buff[4*tx .. 4*tx+11]; the four outputs are centred on b1.x .. b1.w.
-    float4 b0 = reinterpret_cast<float4*>(buff)[tx+0];
-    float4 b1 = reinterpret_cast<float4*>(buff)[tx+1];
-    float4 b2 = reinterpret_cast<float4*>(buff)[tx+2];
-    float4 old4, new4, dif4;
-    // Scale 0 only seeds old4; nothing is written for it.
-    old4.x = kern[0][4]*b1.x + kern[0][3]*(b0.w + b1.y) + kern[0][2]*(b0.z + b1.z) +
-      kern[0][1]*(b0.y + b1.w) + kern[0][0]*(b0.x + b2.x);
-    old4.y = kern[0][4]*b1.y + kern[0][3]*(b1.x + b1.z) + kern[0][2]*(b0.w + b1.w) +
-      kern[0][1]*(b0.z + b2.x) + kern[0][0]*(b0.y + b2.y);
-    old4.z = kern[0][4]*b1.z + kern[0][3]*(b1.y + b1.w) + kern[0][2]*(b1.x + b2.x) +
-      kern[0][1]*(b0.w + b2.y) + kern[0][0]*(b0.z + b2.z);
-    old4.w = kern[0][4]*b1.w + kern[0][3]*(b1.z + b2.x) + kern[0][2]*(b1.y + b2.y) +
-      kern[0][1]*(b1.x + b2.z) + kern[0][0]*(b0.w + b2.w);
-    for (int scale=1;scale<LAPLACE_S;scale++) {
-      float *buf = buff + (LAPLACE_W + 2*LAPLACE_R)*scale; 
-      float4 b0 = reinterpret_cast<float4*>(buf)[tx+0];
-      float4 b1 = reinterpret_cast<float4*>(buf)[tx+1];
-      float4 b2 = reinterpret_cast<float4*>(buf)[tx+2];
-      new4.x = kern[scale][4]*b1.x + kern[scale][3]*(b0.w + b1.y) +
-	kern[scale][2]*(b0.z + b1.z) + kern[scale][1]*(b0.y + b1.w) +
-	kern[scale][0]*(b0.x + b2.x);
-      new4.y = kern[scale][4]*b1.y + kern[scale][3]*(b1.x + b1.z) +
-	kern[scale][2]*(b0.w + b1.w) + kern[scale][1]*(b0.z + b2.x) +
-	kern[scale][0]*(b0.y + b2.y);
-      new4.z = kern[scale][4]*b1.z + kern[scale][3]*(b1.y + b1.w) +
-	kern[scale][2]*(b1.x + b2.x) + kern[scale][1]*(b0.w + b2.y) +
-	kern[scale][0]*(b0.z + b2.z);
-      new4.w = kern[scale][4]*b1.w + kern[scale][3]*(b1.z + b2.x) +
-	kern[scale][2]*(b1.y + b2.y) + kern[scale][1]*(b1.x + b2.z) +
-	kern[scale][0]*(b0.w + b2.w);
-      dif4.x = new4.x - old4.x;
-      dif4.y = new4.y - old4.y;
-      dif4.z = new4.z - old4.z;
-      dif4.w = new4.w - old4.w;
-      // One 4-wide store into output plane (scale-1); columns xp4 .. xp4+3 are written even if some are >= width.
-      reinterpret_cast<float4*>(&d_Result[(scale-1)*height*pitch + yp*pitch + xp4])[0] = dif4;
-      old4 = new4;
-    }
-  }
-}
-
-// Experimental variant in which threadIdx.y selects the scale and each block walks LAPLACE_H
-// consecutive rows starting at LAPLACE_H*blockIdx.y. Parameters have the same meaning as in
-// LaplaceMultiMem. The only launch in LaplaceMulti() (cudaSiftH.cu) sits in a disabled #if 0
-// branch and uses a (LAPLACE_W + 2*LAPLACE_R, LAPLACE_S) thread block.
-// The tap offsets 0..8 are hard-coded, so LAPLACE_R == 4 is presumably required -- TODO confirm.
-__global__ void LaplaceMultiMemTest(float *d_Image, float *d_Result, int width, int pitch, int height, int octave)
-{
-  __shared__ float data1[(LAPLACE_W + 2*LAPLACE_R)*LAPLACE_S];
-  __shared__ float data2[LAPLACE_W*LAPLACE_S];
-  const int tx = threadIdx.x;
-  const int xp = blockIdx.x*LAPLACE_W + tx;
-  const int yp = LAPLACE_H*blockIdx.y;
-  const int scale = threadIdx.y;
-  float *kernel = d_LaplaceKernel + octave*12*16 + scale*16; 
-  float *sdata1 = data1 + (LAPLACE_W + 2*LAPLACE_R)*scale; 
-  // Column xp - 4 clamped to [0, width-1].
-  float *data = d_Image + max(min(xp - 4, width-1), 0);
-  int h = height-1;
-  float temp[8+LAPLACE_H], kern[LAPLACE_R+1];
-  // Load rows yp-4 .. yp+LAPLACE_H+3 of this column, clamped to the image.
-  for (int i=0;i<4;i++)
-    temp[i] = data[max(0, min(yp+i-4, h))*pitch];
-  for (int i=4;i<8+LAPLACE_H;i++)
-    temp[i] = data[min(yp+i-4, h)*pitch];
-  // Reversed tap order: kern[LAPLACE_R] is kernel[0].
-  for (int i=0;i<=LAPLACE_R;i++)
-    kern[i] = kernel[LAPLACE_R - i];
-  for (int j=0;j<LAPLACE_H;j++) {
-    // Vertical pass for row yp+j.
-    sdata1[tx] = kern[4]*temp[4+j] +
-      kern[3]*(temp[3+j] + temp[5+j]) + kern[2]*(temp[2+j] + temp[6+j]) + 
-      kern[1]*(temp[1+j] + temp[7+j]) + kern[0]*(temp[0+j] + temp[8+j]);
-    __syncthreads();
-    float *sdata2 = data2 + LAPLACE_W*scale; 
-    // Horizontal pass into this scale's slice of data2.
-    if (tx<LAPLACE_W) {
-      sdata2[tx] = kern[4]*sdata1[tx+4] + 
-	kern[3]*(sdata1[tx+3] + sdata1[tx+5]) + kern[2]*(sdata1[tx+2] + sdata1[tx+6]) + 
-	kern[1]*(sdata1[tx+1] + sdata1[tx+7]) + kern[0]*(sdata1[tx+0] + sdata1[tx+8]);
-    }
-    __syncthreads(); 
-    // sdata2[tx+LAPLACE_W] is the next scale's slice, hence the scale<LAPLACE_S-1 guard.
-    if (tx<LAPLACE_W && scale<LAPLACE_S-1 && xp<width && (yp+j)<height) 
-      d_Result[scale*height*pitch + (yp+j)*pitch + xp] = sdata2[tx] - sdata2[tx+LAPLACE_W];
-  }
-}
-
-// Older variant: threadIdx.y selects the scale, each block handles one image row (blockIdx.y),
-// and global memory is read directly for every vertical tap. Parameters have the same meaning
-// as in LaplaceMultiMem. The only launch in LaplaceMulti() (cudaSiftH.cu) sits in a disabled
-// #if 0 branch and uses a (LAPLACE_W + 2*LAPLACE_R, LAPLACE_S) thread block.
-// Tap offsets up to 4 are hard-coded, so LAPLACE_R == 4 is presumably required -- TODO confirm.
-__global__ void LaplaceMultiMemOld(float *d_Image, float *d_Result, int width, int pitch, int height, int octave)
-{
-  __shared__ float data1[(LAPLACE_W + 2*LAPLACE_R)*LAPLACE_S];
-  __shared__ float data2[LAPLACE_W*LAPLACE_S];
-  const int tx = threadIdx.x;
-  const int xp = blockIdx.x*LAPLACE_W + tx;
-  const int yp = blockIdx.y;
-  const int scale = threadIdx.y;
-  float *kernel = d_LaplaceKernel + octave*12*16 + scale*16; 
-  float *sdata1 = data1 + (LAPLACE_W + 2*LAPLACE_R)*scale; 
-  // Column xp - 4 clamped to [0, width-1].
-  float *data = d_Image + max(min(xp - 4, width-1), 0);
-  int h = height-1;
-  // Vertical pass; kernel[0] is the centre tap, row indices are clamped to [0, h].
-  sdata1[tx] = kernel[0]*data[min(yp, h)*pitch] +
-    kernel[1]*(data[max(0, min(yp-1, h))*pitch] + data[min(yp+1, h)*pitch]) + 
-    kernel[2]*(data[max(0, min(yp-2, h))*pitch] + data[min(yp+2, h)*pitch]) + 
-    kernel[3]*(data[max(0, min(yp-3, h))*pitch] + data[min(yp+3, h)*pitch]) + 
-    kernel[4]*(data[max(0, min(yp-4, h))*pitch] + data[min(yp+4, h)*pitch]);
-  __syncthreads();
-  float *sdata2 = data2 + LAPLACE_W*scale; 
-  // Horizontal pass into this scale's slice of data2.
-  if (tx<LAPLACE_W) {
-    sdata2[tx] = kernel[0]*sdata1[tx+4] + 
-      kernel[1]*(sdata1[tx+3] + sdata1[tx+5]) +
-      kernel[2]*(sdata1[tx+2] + sdata1[tx+6]) + 
-      kernel[3]*(sdata1[tx+1] + sdata1[tx+7]) +
-      kernel[4]*(sdata1[tx+0] + sdata1[tx+8]);
-  }
-  __syncthreads(); 
-  // sdata2[tx+LAPLACE_W] is the next scale's slice, hence the scale<LAPLACE_S-1 guard.
-  if (tx<LAPLACE_W && scale<LAPLACE_S-1 && xp<width) 
-    d_Result[scale*height*pitch + yp*pitch + xp] = sdata2[tx] - sdata2[tx+LAPLACE_W];
-}
-
-// Separable 9-tap low-pass filter using the taps in d_LowPassKernel (index 4 is the centre).
-// Each thread row (threadIdx.y) filters one image row: vertically from global memory into its
-// slice of the shared buffer, then horizontally from that slice into d_Result.
-//   d_Image, d_Result    : input and output images, addressed as row*pitch + column
-//   width, pitch, height : image extent and row stride, in elements
-// The only launch in the host LowPass() (cudaSiftH.cu) sits in its disabled #else branch and
-// uses a (LOWPASS_W + 2*LOWPASS_R, LOWPASS_H) thread block. The offsets 0..8 are hard-coded,
-// so LOWPASS_R == 4 is presumably required -- TODO confirm.
-__global__ void LowPass(float *d_Image, float *d_Result, int width, int pitch, int height)
-{
-  __shared__ float buffer[(LOWPASS_W + 2*LOWPASS_R)*LOWPASS_H];
-  const int tx = threadIdx.x;
-  const int ty = threadIdx.y;
-  const int xp = blockIdx.x*LOWPASS_W + tx;
-  const int yp = blockIdx.y*LOWPASS_H + ty;
-  float *kernel = d_LowPassKernel;
-  // Column xp - 4 clamped to [0, width-1].
-  float *data = d_Image + max(min(xp - 4, width-1), 0);
-  float *buff = buffer + ty*(LOWPASS_W + 2*LOWPASS_R);
-  int h = height-1;
-  // Vertical pass; row indices are clamped to [0, h].
-  if (yp<height) 
-    buff[tx] = kernel[4]*data[min(yp, h)*pitch] +
-      kernel[3]*(data[max(0, min(yp-1, h))*pitch] + data[min(yp+1, h)*pitch]) + 
-      kernel[2]*(data[max(0, min(yp-2, h))*pitch] + data[min(yp+2, h)*pitch]) + 
-      kernel[1]*(data[max(0, min(yp-3, h))*pitch] + data[min(yp+3, h)*pitch]) + 
-      kernel[0]*(data[max(0, min(yp-4, h))*pitch] + data[min(yp+4, h)*pitch]);
-  // The barrier is outside the yp<height test so that every thread of the block reaches it.
-  __syncthreads();
-  // Horizontal pass; only the first LOWPASS_W threads inside the image write a result.
-  if (tx<LOWPASS_W && xp<width && yp<height)
-    d_Result[yp*pitch + xp] = kernel[4]*buff[tx+4] + 
-      kernel[3]*(buff[tx+3] + buff[tx+5]) + kernel[2]*(buff[tx+2] + buff[tx+6]) + 
-      kernel[1]*(buff[tx+1] + buff[tx+7]) + kernel[0]*(buff[tx+0] + buff[tx+8]);
-}
-
-// Older block-wise 9-tap low-pass filter. Each loop step filters one image row per thread row
-// horizontally via ShiftDown() and stores it in a 16-row ring buffer (xrows); once enough rows
-// are buffered, the vertical 9-tap sum over the ring is written to d_Result.
-// Parameters have the same meaning as in LowPass. ShiftDown() is defined outside this view;
-// presumably it exchanges values between neighbouring threads -- TODO confirm.
-// No launch of this kernel is visible in the host LowPass() wrapper. The loop advances l in
-// steps of 4, which presumably matches a blockDim.y of 4 -- TODO confirm.
-__global__ void LowPassBlockOld(float *d_Image, float *d_Result, int width, int pitch, int height)
-{
-  __shared__ float xrows[16][32];          
-  const int tx = threadIdx.x;
-  const int ty = threadIdx.y;
-  const int xp = blockIdx.x*LOWPASS_W + tx;
-  const int yp = blockIdx.y*LOWPASS_H + ty;
-  const int N = 16;
-  float *k = d_LowPassKernel;
-  // Column xp - 4 clamped to [0, width-1].
-  int xl = max(min(xp - 4, width-1), 0);
-  for (int l=-8;l<=LOWPASS_H;l+=4) {
-    if (l<LOWPASS_H) {
-      // Horizontally filter image row yp + l + 4 (clamped) and store it in ring slot (l + 8 + ty)%N.
-      int yl = max(min(yp + l + 4, height-1), 0);
-      float val = d_Image[yl*pitch + xl];
-      xrows[(l + 8 + ty)%N][tx] = k[4]*ShiftDown(val, 4) +
-	k[3]*(ShiftDown(val, 5) + ShiftDown(val, 3)) +
-	k[2]*(ShiftDown(val, 6) + ShiftDown(val, 2)) +
-	k[1]*(ShiftDown(val, 7) + ShiftDown(val, 1)) +
-	k[0]*(ShiftDown(val, 8) + val);
-    }
-    if (l>=4) {
-      // Vertical pass for output row yp + l - 4, centred on ring slot (l + ty)%N.
-      int ys = yp + l - 4;
-      if (xp<width && ys<height && tx<LOWPASS_W)
-	d_Result[ys*pitch + xp] = k[4]*xrows[(l + 0 + ty)%N][tx] +
-	     k[3]*(xrows[(l - 1 + ty)%N][tx] + xrows[(l + 1 + ty)%N][tx]) +
-	     k[2]*(xrows[(l - 2 + ty)%N][tx] + xrows[(l + 2 + ty)%N][tx]) +
-	     k[1]*(xrows[(l - 3 + ty)%N][tx] + xrows[(l + 3 + ty)%N][tx]) +
-	     k[0]*(xrows[(l - 4 + ty)%N][tx] + xrows[(l + 4 + ty)%N][tx]);
-    }
-    // The condition depends only on l, so all threads of the block take the barrier together.
-    if (l>=0)
-      __syncthreads();
-  }
-}
-
-// Block-wise 9-tap low-pass filter. Rows are filtered horizontally via ShiftDown() into a
-// 16-row ring buffer (xrows) and then combined vertically into d_Result.
-// Parameters have the same meaning as in LowPass. The host LowPass() wrapper in cudaSiftH.cu
-// launches this kernel with a (LOWPASS_W + 2*LOWPASS_R, 4) thread block, which matches the
-// step of 4 used by both loops. ShiftDown() is defined outside this view; presumably it
-// exchanges values between neighbouring threads -- TODO confirm.
-__global__ void LowPassBlock(float *d_Image, float *d_Result, int width, int pitch, int height)
-{
-  __shared__ float xrows[16][32];          
-  const int tx = threadIdx.x;
-  const int ty = threadIdx.y;
-  const int xp = blockIdx.x*LOWPASS_W + tx;
-  const int yp = blockIdx.y*LOWPASS_H + ty;
-  const int N = 16;
-  float *k = d_LowPassKernel;
-  // Column xp - 4 clamped to [0, width-1].
-  int xl = max(min(xp - 4, width-1), 0);
-  // Prologue: fill ring rows l + ty + 8 for l = -8, -4, 0 without producing output yet.
-#pragma unroll
-  for (int l=-8;l<4;l+=4) {
-    int ly = l + ty;
-    int yl = max(min(yp + l + 4, height-1), 0);
-    float val = d_Image[yl*pitch + xl];
-    val = k[4]*ShiftDown(val, 4) +
-      k[3]*(ShiftDown(val, 5) + ShiftDown(val, 3)) +
-      k[2]*(ShiftDown(val, 6) + ShiftDown(val, 2)) +
-      k[1]*(ShiftDown(val, 7) + ShiftDown(val, 1)) +
-      k[0]*(ShiftDown(val, 8) + val);
-    xrows[ly + 8][tx] = val;
-  }
-  __syncthreads();
-  // Steady state: add one filtered row to the ring and emit output row yp + l - 4.
-#pragma unroll
-  for (int l=4;l<LOWPASS_H;l+=4) {
-    int ly = l + ty;
-    // yp + l + 4 is positive here, so only the upper clamp is applied.
-    int yl = min(yp + l + 4, height-1);
-    float val = d_Image[yl*pitch + xl];
-    val = k[4]*ShiftDown(val, 4) +
-      k[3]*(ShiftDown(val, 5) + ShiftDown(val, 3)) +
-      k[2]*(ShiftDown(val, 6) + ShiftDown(val, 2)) +
-      k[1]*(ShiftDown(val, 7) + ShiftDown(val, 1)) +
-      k[0]*(ShiftDown(val, 8) + val);
-    xrows[(ly + 8)%N][tx] = val;
-    int ys = yp + l - 4;
-    if (xp<width && ys<height && tx<LOWPASS_W)
-      d_Result[ys*pitch + xp] = k[4]*xrows[(ly + 0)%N][tx] +
-		       k[3]*(xrows[(ly - 1)%N][tx] + xrows[(ly + 1)%N][tx]) +
-		       k[2]*(xrows[(ly - 2)%N][tx] + xrows[(ly + 2)%N][tx]) +
-		       k[1]*(xrows[(ly - 3)%N][tx] + xrows[(ly + 3)%N][tx]) +
-		       k[0]*(xrows[(ly - 4)%N][tx] + xrows[(ly + 4)%N][tx]);
-    __syncthreads();
-  }
-  // Epilogue: emit the last group of rows (yp + LOWPASS_H - 4) from the rows already buffered.
-  int ly = LOWPASS_H + ty;
-  int ys = yp + LOWPASS_H - 4;
-  if (xp<width && ys<height && tx<LOWPASS_W)
-    d_Result[ys*pitch + xp] = k[4]*xrows[(ly + 0)%N][tx] +
-		     k[3]*(xrows[(ly - 1)%N][tx] + xrows[(ly + 1)%N][tx]) +
-		     k[2]*(xrows[(ly - 2)%N][tx] + xrows[(ly + 2)%N][tx]) +
-		     k[1]*(xrows[(ly - 3)%N][tx] + xrows[(ly + 3)%N][tx]) +
-		     k[0]*(xrows[(ly - 4)%N][tx] + xrows[(ly + 4)%N][tx]);
-}
-
diff --git a/cudaSiftH.cu b/cudaSiftH.cu
deleted file mode 100644
index cbd47ce..0000000
--- a/cudaSiftH.cu
+++ /dev/null
@@ -1,515 +0,0 @@
-//********************************************************//
-// CUDA SIFT extractor by Mårten Björkman aka Celebrandil //
-//********************************************************//  
-
-#include <cstdio>
-#include <cstring>
-#include <cmath>
-#include <iostream>
-#include <algorithm>
-#include "cudautils.h"
-
-#include "cudaImage.h"
-#include "cudaSift.h"
-#include "cudaSiftD.h"
-#include "cudaSiftH.h"
-
-#include "cudaSiftD.cu"
-
-// Selects CUDA device devNum (clamped to the range of available devices) and prints its
-// name, memory clock, bus width and the peak memory bandwidth derived from them.
-// Prints a message and returns without selecting a device when none is available.
-void InitCuda(int devNum)
-{
-  int nDevices = 0;
-  // A failed query is treated like "no devices" instead of reading an unset counter.
-  if (cudaGetDeviceCount(&nDevices) != cudaSuccess || nDevices < 1) {
-    std::cerr << "No CUDA devices available" << std::endl;
-    return;
-  }
-  // Clamp on both sides so the index queried and printed below is the device that was selected.
-  devNum = std::max(0, std::min(nDevices-1, devNum));
-  // deviceInit() reports its own error on stderr when it returns false.
-  if (!deviceInit(devNum))
-    return;
-  cudaDeviceProp prop;
-  safeCall(cudaGetDeviceProperties(&prop, devNum));
-  printf("Device Number: %d\n", devNum);
-  printf("  Device name: %s\n", prop.name);
-  printf("  Memory Clock Rate (MHz): %d\n", prop.memoryClockRate/1000);
-  printf("  Memory Bus Width (bits): %d\n", prop.memoryBusWidth);
-  printf("  Peak Memory Bandwidth (GB/s): %.1f\n\n",
-	 2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
-}
-
-// Allocates the device scratch buffer that ExtractSift() needs for an image of the given size.
-//   width, height : input image size in pixels
-//   numOctaves    : number of octaves that will be extracted
-//   scaleUp       : true when the image will be doubled in size before extraction
-// Returns the device pointer; release it with FreeSiftTempMemory().
-float *AllocSiftTempMemory(int width, int height, int numOctaves, bool scaleUp)
-{
-  TimerGPU timer(0);
-  const int nd = NUM_SCALES + 3;
-  int w = width*(scaleUp ? 2 : 1);
-  int h = height*(scaleUp ? 2 : 1);
-  int p = iAlignUp(w, 128);
-  // Sizes are counted in floats and kept in size_t so large images cannot overflow an int.
-  size_t size = (size_t)h*p;            // image sizes
-  size_t sizeTmp = (size_t)nd*h*p;      // laplace buffer sizes
-  for (int i=0;i<numOctaves;i++) {
-    w /= 2;
-    h /= 2;
-    const int po = iAlignUp(w, 128);
-    size += (size_t)h*po;
-    sizeTmp += (size_t)nd*h*po;
-  }
-  size += sizeTmp;
-  float *memoryTmp = NULL;
-  size_t pitch = 0;
-  // The buffer is requested as rows of 4096 bytes; rows is chosen so that it holds `size` floats.
-  const size_t rows = (size+4095)/4096*sizeof(float);
-  safeCall(cudaMallocPitch((void **)&memoryTmp, &pitch, (size_t)4096, rows));
-#ifdef VERBOSE
-  // Report the real allocation in bytes (the value used to be the float count).
-  printf("Allocated memory size: %zu bytes\n", pitch*rows);
-  printf("Memory allocation time =      %.2f ms\n\n", timer.read());
-#endif
-  return memoryTmp;
-}
-
-// Releases a device buffer with cudaFree(); a NULL pointer is ignored.
-void FreeSiftTempMemory(float *memoryTmp)
-{
-  if (memoryTmp == NULL)
-    return;
-  safeCall(cudaFree(memoryTmp));
-}
-
-// Extracts SIFT features from img into siftData.
-//   siftData    : receives the points; numPts is set to the number found, capped at maxPts
-//   img         : input image on the device
-//   numOctaves  : number of octaves to process; at most 7 (see the guard below)
-//   initBlur    : blur passed to the initial low-pass filter (floored at 0.001)
-//   thresh      : detection threshold forwarded to the point finder
-//   lowestScale : smallest feature scale to keep
-//   scaleUp     : when true the image is doubled first and positions are halved afterwards
-//   tempMemory  : optional scratch buffer from AllocSiftTempMemory(); when NULL a buffer is
-//                 allocated and freed inside this call
-void ExtractSift(SiftData &siftData, CudaImage &img, int numOctaves, double initBlur, float thresh, float lowestScale, bool scaleUp, float *tempMemory) 
-{
-  // PrepareLaplaceKernels() writes at kernel[numOctaves*12*16 + ...] into the 8*12*16-float
-  // arrays below, so an octave count of 8 or more would overrun them.
-  if (numOctaves > 7) {
-    fprintf(stderr, "ExtractSift: numOctaves = %d exceeds the supported maximum of 7\n", numOctaves);
-    siftData.numPts = 0;
-    return;
-  }
-  TimerGPU timer(0);
-  unsigned int *d_PointCounterAddr;
-  safeCall(cudaGetSymbolAddress((void**)&d_PointCounterAddr, d_PointCounter));
-  safeCall(cudaMemset(d_PointCounterAddr, 0, (8*2+1)*sizeof(int)));
-  safeCall(cudaMemcpyToSymbol(d_MaxNumPoints, &siftData.maxPts, sizeof(int)));
-
-  const int nd = NUM_SCALES + 3;
-  int w = img.width*(scaleUp ? 2 : 1);
-  int h = img.height*(scaleUp ? 2 : 1);
-  int p = iAlignUp(w, 128);
-  int width = w, height = h;
-  int size = h*p;                 // image sizes
-  int sizeTmp = nd*h*p;           // laplace buffer sizes
-  for (int i=0;i<numOctaves;i++) {
-    w /= 2;
-    h /= 2;
-    int p = iAlignUp(w, 128);
-    size += h*p;
-    sizeTmp += nd*h*p; 
-  }
-  float *memoryTmp = tempMemory; 
-  size += sizeTmp;
-  if (!tempMemory) {
-    size_t pitch = 0;
-    // The buffer is requested as rows of 4096 bytes; rows is chosen so that it holds `size` floats.
-    const size_t rows = (size+4095)/4096*sizeof(float);
-    safeCall(cudaMallocPitch((void **)&memoryTmp, &pitch, (size_t)4096, rows));
-#ifdef VERBOSE
-    // Report the real allocation in bytes (the value used to be the float count).
-    printf("Allocated memory size: %zu bytes\n", pitch*rows);
-    printf("Memory allocation time =      %.2f ms\n\n", timer.read());
-#endif
-  }
-  // Layout of the scratch buffer: [laplace buffers (sizeTmp floats)] [image pyramid].
-  float *memorySub = memoryTmp + sizeTmp;
-
-  CudaImage lowImg;
-  lowImg.Allocate(width, height, iAlignUp(width, 128), false, memorySub);
-  if (!scaleUp) {
-    float kernel[8*12*16];
-    PrepareLaplaceKernels(numOctaves, 0.0f, kernel);
-    safeCall(cudaMemcpyToSymbolAsync(d_LaplaceKernel, kernel, 8*12*16*sizeof(float)));
-    LowPass(lowImg, img, max(initBlur, 0.001f));
-    TimerGPU timer1(0);
-    ExtractSiftLoop(siftData, lowImg, numOctaves, 0.0f, thresh, lowestScale, 1.0f, memoryTmp, memorySub + height*iAlignUp(width, 128));
-    safeCall(cudaMemcpy(&siftData.numPts, &d_PointCounterAddr[2*numOctaves], sizeof(int), cudaMemcpyDeviceToHost)); 
-    siftData.numPts = (siftData.numPts<siftData.maxPts ? siftData.numPts : siftData.maxPts);
-    printf("SIFT extraction time =        %.2f ms %d\n", timer1.read(), siftData.numPts);
-  } else {
-    // The up-scaled image temporarily occupies the start of the laplace buffer area.
-    CudaImage upImg;
-    upImg.Allocate(width, height, iAlignUp(width, 128), false, memoryTmp);
-    TimerGPU timer1(0); 
-    ScaleUp(upImg, img);
-    LowPass(lowImg, upImg, max(initBlur, 0.001f));
-    float kernel[8*12*16];
-    PrepareLaplaceKernels(numOctaves, 0.0f, kernel);
-    safeCall(cudaMemcpyToSymbolAsync(d_LaplaceKernel, kernel, 8*12*16*sizeof(float)));
-    ExtractSiftLoop(siftData, lowImg, numOctaves, 0.0f, thresh, lowestScale*2.0f, 1.0f, memoryTmp, memorySub + height*iAlignUp(width, 128));
-    safeCall(cudaMemcpy(&siftData.numPts, &d_PointCounterAddr[2*numOctaves], sizeof(int), cudaMemcpyDeviceToHost)); 
-    siftData.numPts = (siftData.numPts<siftData.maxPts ? siftData.numPts : siftData.maxPts);
-    // Map positions found in the doubled image back to the original resolution.
-    RescalePositions(siftData, 0.5f);
-    printf("SIFT extraction time =        %.2f ms\n", timer1.read());
-  } 
-  
-  // Only free the scratch buffer when it was allocated by this call.
-  if (!tempMemory)
-    safeCall(cudaFree(memoryTmp));
-#ifdef MANAGEDMEM
-  safeCall(cudaDeviceSynchronize());
-#else
-  if (siftData.h_data)
-    safeCall(cudaMemcpy(siftData.h_data, siftData.d_data, sizeof(SiftPoint)*siftData.numPts, cudaMemcpyDeviceToHost));
-#endif
-  double totTime = timer.read();
-  printf("Incl prefiltering & memcpy =  %.2f ms %d\n\n", totTime, siftData.numPts);
-}
-
-// Recursively processes the octaves: the image is first halved and the coarser octaves are
-// handled by a recursive call, then this octave itself is extracted. Always returns 0.
-//   memoryTmp : scratch area handed unchanged to every ExtractSiftOctave() call
-//   memorySub : storage for the half-size image; the recursion continues right after it
-int ExtractSiftLoop(SiftData &siftData, CudaImage &img, int numOctaves, double initBlur, float thresh, float lowestScale, float subsampling, float *memoryTmp, float *memorySub) 
-{
-#ifdef VERBOSE
-  TimerGPU timer(0);
-#endif
-  if (numOctaves>1) {
-    const int halfW = img.width/2;
-    const int halfH = img.height/2;
-    const int halfPitch = iAlignUp(halfW, 128);
-    CudaImage subImg;
-    subImg.Allocate(halfW, halfH, halfPitch, false, memorySub);
-    ScaleDown(subImg, img, 0.5f);
-    // Blur carried over to the half-size image.
-    const float totInitBlur = (float)sqrt(initBlur*initBlur + 0.5f*0.5f) / 2.0f;
-    float *nextSub = memorySub + halfH*halfPitch;
-    ExtractSiftLoop(siftData, subImg, numOctaves-1, totInitBlur, thresh, lowestScale, subsampling*2.0f, memoryTmp, nextSub);
-  }
-  ExtractSiftOctave(siftData, img, numOctaves, thresh, lowestScale, subsampling, memoryTmp);
-#ifdef VERBOSE
-  const double totTime = timer.read();
-  printf("ExtractSift time total =      %.2f ms %d\n\n", totTime, numOctaves);
-#endif
-  return 0;
-}
-
-// Runs the full pipeline for one octave: multi-scale difference images, point detection,
-// orientation assignment and descriptor extraction.
-//   siftData    : point storage that the kernels append to
-//   img         : image for this octave (device memory)
-//   octave      : octave index forwarded to the kernels
-//   thresh, lowestScale, subsampling : forwarded to FindPointsMulti / ExtractSiftDescriptors
-//   memoryTmp   : scratch area that holds the nd-1 difference images
-void ExtractSiftOctave(SiftData &siftData, CudaImage &img, int octave, float thresh, float lowestScale, float subsampling, float *memoryTmp)
-{
-  const int nd = NUM_SCALES + 3;
-#ifdef VERBOSE
-  unsigned int *d_PointCounterAddr;
-  safeCall(cudaGetSymbolAddress((void**)&d_PointCounterAddr, d_PointCounter));
-  unsigned int fstPts, totPts;
-  safeCall(cudaMemcpy(&fstPts, &d_PointCounterAddr[2*octave-1], sizeof(int), cudaMemcpyDeviceToHost)); 
-  TimerGPU timer0;
-#endif
-  CudaImage diffImg[nd];
-  int w = img.width; 
-  int h = img.height;
-  int p = iAlignUp(w, 128);
-  // Only nd-1 difference images are backed by memory; they are laid out back to back.
-  for (int i=0;i<nd-1;i++) 
-    diffImg[i].Allocate(w, h, p, false, memoryTmp + i*p*h); 
-
-  // Specify texture
-  struct cudaResourceDesc resDesc;
-  memset(&resDesc, 0, sizeof(resDesc));
-  resDesc.resType = cudaResourceTypePitch2D;
-  resDesc.res.pitch2D.devPtr = img.d_data;
-  resDesc.res.pitch2D.width = img.width;
-  resDesc.res.pitch2D.height = img.height;
-  resDesc.res.pitch2D.pitchInBytes = img.pitch*sizeof(float);  
-  resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
-  // Specify texture object parameters
-  struct cudaTextureDesc texDesc;
-  memset(&texDesc, 0, sizeof(texDesc));
-  texDesc.addressMode[0]   = cudaAddressModeClamp;
-  texDesc.addressMode[1]   = cudaAddressModeClamp;
-  texDesc.filterMode       = cudaFilterModeLinear;
-  texDesc.readMode         = cudaReadModeElementType;
-  texDesc.normalizedCoords = 0;
-  // Create texture object; a failure is reported here instead of surfacing in a later kernel.
-  cudaTextureObject_t texObj = 0;
-  safeCall(cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL));
-
-#ifdef VERBOSE
-  TimerGPU timer1;
-#endif
-  LaplaceMulti(texObj, img, diffImg, octave); 
-  FindPointsMulti(diffImg, siftData, thresh, 10.0f, 1.0f/NUM_SCALES, lowestScale/subsampling, subsampling, octave);
-#ifdef VERBOSE
-  double gpuTimeDoG = timer1.read();
-  TimerGPU timer4;
-#endif
-  ComputeOrientations(texObj, img, siftData, octave); 
-  ExtractSiftDescriptors(texObj, siftData, subsampling, octave); 
-  //OrientAndExtract(texObj, siftData, subsampling, octave); 
-  
-  safeCall(cudaDestroyTextureObject(texObj));
-#ifdef VERBOSE
-  double gpuTimeSift = timer4.read();
-  double totTime = timer0.read();
-  printf("GPU time : %.2f ms + %.2f ms + %.2f ms = %.2f ms\n", totTime-gpuTimeDoG-gpuTimeSift, gpuTimeDoG, gpuTimeSift, totTime);
-  safeCall(cudaMemcpy(&totPts, &d_PointCounterAddr[2*octave+1], sizeof(int), cudaMemcpyDeviceToHost));
-  totPts = (totPts<siftData.maxPts ? totPts : siftData.maxPts);
-  // Clamp the starting count as well so the unsigned difference cannot wrap around.
-  fstPts = (fstPts<totPts ? fstPts : totPts);
-  const unsigned int newPts = totPts - fstPts;
-  // Per-point timing is only meaningful (and the division only defined) when points were added.
-  if (newPts>0) 
-    printf("           %.2f ms / DoG,  %.4f ms / Sift,  #Sift = %u\n", gpuTimeDoG/NUM_SCALES, gpuTimeSift/newPts, newPts); 
-#endif
-}
-
-// Initialises data for up to num points.
-//   host : allocate the host array h_data (ignored when MANAGEDMEM is defined)
-//   dev  : allocate the device array d_data (ignored when MANAGEDMEM is defined)
-// With MANAGEDMEM a single managed array m_data is allocated instead.
-void InitSiftData(SiftData &data, int num, bool host, bool dev)
-{
-  data.numPts = 0;
-  data.maxPts = num;
-  // Byte count kept in size_t; an int would overflow for large point counts.
-  const size_t sz = sizeof(SiftPoint)*(size_t)num;
-#ifdef MANAGEDMEM
-  safeCall(cudaMallocManaged((void **)&data.m_data, sz));
-#else
-  data.h_data = NULL;
-  if (host)
-    data.h_data = (SiftPoint *)malloc(sz);
-  data.d_data = NULL;
-  if (dev)
-    safeCall(cudaMalloc((void **)&data.d_data, sz));
-#endif
-}
-
-// Releases the storage owned by data and resets its counters.
-// Every freed pointer is reset to NULL, so calling this twice on the same object is harmless.
-void FreeSiftData(SiftData &data)
-{
-#ifdef MANAGEDMEM
-  safeCall(cudaFree(data.m_data));
-  data.m_data = NULL;
-#else
-  if (data.d_data!=NULL)
-    safeCall(cudaFree(data.d_data));
-  data.d_data = NULL;
-  if (data.h_data!=NULL)
-    free(data.h_data);
-  // Previously left dangling, which made a second call double-free the host array.
-  data.h_data = NULL;
-#endif
-  data.numPts = 0;
-  data.maxPts = 0;
-}
-
-// Prints every point in data (position, scale, quality measures and descriptor) to stdout,
-// followed by the number of available and allocated points.
-// Without MANAGEDMEM, a missing host array is allocated here, filled from the device and
-// stored in data.h_data, so data owns it afterwards.
-void PrintSiftData(SiftData &data)
-{
-#ifdef MANAGEDMEM
-  SiftPoint *h_data = data.m_data;
-#else
-  if (data.h_data==NULL) {
-    SiftPoint *fetched = (SiftPoint *)malloc(sizeof(SiftPoint)*data.maxPts);
-    safeCall(cudaMemcpy(fetched, data.d_data, sizeof(SiftPoint)*data.numPts, cudaMemcpyDeviceToHost));
-    data.h_data = fetched;
-  }
-  SiftPoint *h_data = data.h_data;
-#endif
-  for (int i=0;i<data.numPts;i++) {
-    SiftPoint &pt = h_data[i];
-    printf("xpos         = %.2f\n", pt.xpos);
-    printf("ypos         = %.2f\n", pt.ypos);
-    printf("scale        = %.2f\n", pt.scale);
-    printf("sharpness    = %.2f\n", pt.sharpness);
-    printf("edgeness     = %.2f\n", pt.edgeness);
-    printf("orientation  = %.2f\n", pt.orientation);
-    printf("score        = %.2f\n", pt.score);
-    // The descriptor is printed as 8 lines of 16 values; entry (j, k) is element j+8*k.
-    float *desc = (float*)&pt.data;
-    for (int j=0;j<8;j++) {
-      const char *lead = (j==0 ? "data = " : "       ");
-      printf("%s", lead);
-      for (int k=0;k<16;k++) {
-        const float v = desc[j+8*k];
-        // Small values are shown as a dot to keep the table readable.
-        if (v<0.05)
-          printf(" .   ");
-        else
-          printf("%.2f ", v);
-      }
-      printf("\n");
-    }
-  }
-  printf("Number of available points: %d\n", data.numPts);
-  printf("Number of allocated points: %d\n", data.maxPts);
-}
-
-///////////////////////////////////////////////////////////////////////////////
-// Host side master functions
-///////////////////////////////////////////////////////////////////////////////
-
-// Launches the ScaleDown kernel that writes a reduced version of src into res.
-// The 5-tap kernel exp(-d*d/(2*variance)), d = -2..2, is normalised to sum 1 and uploaded to
-// d_ScaleDownKernel only when variance differs from the value used by the previous call.
-// Prints a message and does nothing when either image has no device data. Always returns 0.0.
-double ScaleDown(CudaImage &res, CudaImage &src, float variance)
-{
-  static float oldVariance = -1.0f;
-  if (!res.d_data || !src.d_data) {
-    printf("ScaleDown: missing data\n");
-    return 0.0;
-  }
-  if (variance!=oldVariance) {
-    float taps[5];
-    float total = 0.0f;
-    for (int j=0;j<5;j++) {
-      const int d = j - 2;
-      taps[j] = (float)expf(-(double)d*d/2.0/variance);
-      total += taps[j];
-    }
-    for (int j=0;j<5;j++)
-      taps[j] /= total;
-    safeCall(cudaMemcpyToSymbol(d_ScaleDownKernel, taps, 5*sizeof(float)));
-    oldVariance = variance;
-  }
-  // Both variants tile the source image the same way; only the thread block shape differs.
-  dim3 blocks(iDivUp(src.width, SCALEDOWN_W), iDivUp(src.height, SCALEDOWN_H));
-#if 0
-  dim3 threads(SCALEDOWN_W + 4, SCALEDOWN_H + 4);
-  ScaleDownDenseShift<<<blocks, threads>>>(res.d_data, src.d_data, src.width, src.pitch, src.height, res.pitch);
-#else
-  dim3 threads(SCALEDOWN_W + 4);
-  ScaleDown<<<blocks, threads>>>(res.d_data, src.d_data, src.width, src.pitch, src.height, res.pitch);
-#endif
-  checkMsg("ScaleDown() execution failed\n");
-  return 0.0;
-}
-
-// Launches the ScaleUp kernel that fills res from src. The grid tiles res in
-// SCALEUP_W x SCALEUP_H blocks, each run by SCALEUP_W/2 x SCALEUP_H/2 threads.
-// Prints a message and does nothing when either image has no device data. Always returns 0.0.
-double ScaleUp(CudaImage &res, CudaImage &src)
-{
-  if (!res.d_data || !src.d_data) {
-    printf("ScaleUp: missing data\n");
-    return 0.0;
-  }
-  const dim3 grid(iDivUp(res.width, SCALEUP_W), iDivUp(res.height, SCALEUP_H));
-  const dim3 block(SCALEUP_W/2, SCALEUP_H/2);
-  ScaleUp<<<grid, block>>>(res.d_data, src.d_data, src.width, src.pitch, src.height, res.pitch);
-  checkMsg("ScaleUp() execution failed\n");
-  return 0.0;
-}
-
-// Launches the orientation kernel for the points of one octave. Always returns 0.0.
-//   texObj   : texture object of the octave image
-//   src      : the same image as plain device memory (only used by the disabled variant)
-//   siftData : point storage read and updated by the kernel
-//   octave   : octave index forwarded to the kernel
-double ComputeOrientations(cudaTextureObject_t texObj, CudaImage &src, SiftData &siftData, int octave)
-{
-  dim3 blocks(512); 
-#ifdef MANAGEDMEM
-  // Same block size as the non-managed launch of this kernel; `threads` was undeclared here
-  // before, so the file did not compile with MANAGEDMEM defined.
-  dim3 threads(11*11);
-  ComputeOrientationsCONST<<<blocks, threads>>>(texObj, siftData.m_data, octave);
-#else
-#if 1
-  dim3 threads(11*11);
-  ComputeOrientationsCONST<<<blocks, threads>>>(texObj, siftData.d_data, octave);
-#else
-  dim3 threads(256); 
-  ComputeOrientationsCONSTNew<<<blocks, threads>>>(src.d_data, src.width, src.pitch, src.height, siftData.d_data, octave);
-#endif
-#endif
-  checkMsg("ComputeOrientations() execution failed\n");
-  return 0.0;
-}
-
-// Launches the descriptor kernel for the points of one octave using 512 blocks of 16x8
-// threads. With MANAGEDMEM the kernel reads siftData.m_data, otherwise siftData.d_data.
-// Always returns 0.0.
-double ExtractSiftDescriptors(cudaTextureObject_t texObj, SiftData &siftData, float subsampling, int octave)
-{
-  const dim3 grid(512);
-  const dim3 block(16, 8);
-#ifdef MANAGEDMEM
-  ExtractSiftDescriptorsCONST<<<grid, block>>>(texObj, siftData.m_data, subsampling, octave);
-#else
-  ExtractSiftDescriptorsCONSTNew<<<grid, block>>>(texObj, siftData.d_data, subsampling, octave);
-#endif
-  checkMsg("ExtractSiftDescriptors() execution failed\n");
-  return 0.0;
-}
-
-// Launches the combined orientation + descriptor kernel with 256 blocks of 128 threads.
-// With MANAGEDMEM the kernel reads siftData.m_data, otherwise siftData.d_data.
-// Always returns 0.0.
-double OrientAndExtract(cudaTextureObject_t texObj, SiftData &siftData, float subsampling, int octave)
-{
-  const dim3 grid(256);
-  const dim3 block(128);
-#ifdef MANAGEDMEM
-  OrientAndExtractCONST<<<grid, block>>>(texObj, siftData.m_data, subsampling, octave);
-#else
-  OrientAndExtractCONST<<<grid, block>>>(texObj, siftData.d_data, subsampling, octave);
-#endif
-  checkMsg("OrientAndExtract() execution failed\n");
-  return 0.0;
-}
-
-// Launches the RescalePositions kernel over the first siftData.numPts device points with
-// the given scale factor, using 64 threads per block. Always returns 0.0.
-double RescalePositions(SiftData &siftData, float scale)
-{
-  // A grid of zero blocks is an invalid launch configuration, which checkMsg() would turn
-  // into a process exit; with no points there is simply nothing to rescale.
-  if (siftData.numPts<=0)
-    return 0.0;
-  dim3 blocks(iDivUp(siftData.numPts, 64));
-  dim3 threads(64);
-  RescalePositions<<<blocks, threads>>>(siftData.d_data, siftData.numPts, scale);
-  checkMsg("RescalePositions() execution failed\n");
-  return 0.0; 
-}
-
-// Launches the low-pass kernel that filters src into res.
-// The (2*LOWPASS_R+1)-tap kernel exp(-j*j/(2*scale*scale)) is normalised to sum 1 and uploaded
-// to d_LowPassKernel only when scale differs from the value used by the previous call.
-// The launch geometry is taken from res. Always returns 0.0.
-double LowPass(CudaImage &res, CudaImage &src, float scale)
-{
-  static float oldScale = -1.0f;
-  if (scale!=oldScale) {
-    float kernel[2*LOWPASS_R+1];
-    float kernelSum = 0.0f;
-    const float ivar2 = 1.0f/(2.0f*scale*scale);
-    for (int i=0;i<=2*LOWPASS_R;i++) {
-      const int j = i - LOWPASS_R;
-      kernel[i] = (float)expf(-(double)j*j*ivar2);
-      kernelSum += kernel[i];
-    }
-    for (int i=0;i<=2*LOWPASS_R;i++)
-      kernel[i] /= kernelSum;
-    safeCall(cudaMemcpyToSymbol(d_LowPassKernel, kernel, (2*LOWPASS_R+1)*sizeof(float)));
-    oldScale = scale;
-  }
-  const int width = res.width;
-  const int pitch = res.pitch;
-  const int height = res.height;
-  dim3 blocks(iDivUp(width, LOWPASS_W), iDivUp(height, LOWPASS_H));
-#if 1
-  // Four thread rows per block; LowPassBlock walks the rows of its tile in steps of 4.
-  dim3 threads(LOWPASS_W+2*LOWPASS_R, 4);
-  LowPassBlock<<<blocks, threads>>>(src.d_data, res.d_data, width, pitch, height);
-#else
-  dim3 threads(LOWPASS_W+2*LOWPASS_R, LOWPASS_H);
-  LowPass<<<blocks, threads>>>(src.d_data, res.d_data, width, pitch, height);
-#endif
-  checkMsg("LowPass() execution failed\n");
-  return 0.0;
-}
-
-//==================== Multi-scale functions ===================//
-
-// Fills the kernel table for octave index numOctaves and, through recursion, for every lower
-// index down to 1. Each octave owns 12*16 floats and each of its NUM_SCALES+3 scales owns 16
-// floats, of which the first LAPLACE_R+1 hold one half of a normalised symmetric kernel
-// (index 0 is the centre tap).
-//   initBlur : blur already present in the octave's image; its variance is subtracted
-//   kernel   : table to fill; must have room for index (numOctaves+1)*12*16 - 1
-void PrepareLaplaceKernels(int numOctaves, float initBlur, float *kernel)
-{
-  if (numOctaves>1) {
-    const float totInitBlur = (float)sqrt(initBlur*initBlur + 0.5f*0.5f) / 2.0f;
-    PrepareLaplaceKernels(numOctaves-1, totInitBlur, kernel);
-  }
-  float *octaveTaps = kernel + numOctaves*12*16;
-  const float diffScale = pow(2.0f, 1.0f/NUM_SCALES);
-  float scale = pow(2.0f, -1.0f/NUM_SCALES);
-  for (int i=0;i<NUM_SCALES+3;i++) {
-    float *taps = octaveTaps + 16*i;
-    const float var = scale*scale - initBlur*initBlur;
-    float kernelSum = 0.0f;
-    for (int j=0;j<=LAPLACE_R;j++) {
-      taps[j] = (float)expf(-(double)j*j/2.0/var);
-      // Off-centre taps occur on both sides of the full kernel, so they count twice.
-      kernelSum += (j==0 ? 1 : 2)*taps[j];
-    }
-    for (int j=0;j<=LAPLACE_R;j++)
-      taps[j] /= kernelSum;
-    scale *= diffScale;
-  }
-}
- 
-// Launches the multi-scale filter kernel for one octave. The output starts at
-// results[0].d_data and the image geometry is taken from results[0]. Always returns 0.0.
-// Only the LaplaceMultiMem variant is enabled; the others are kept as disabled alternatives.
-double LaplaceMulti(cudaTextureObject_t texObj, CudaImage &baseImage, CudaImage *results, int octave) 
-{
-  CudaImage &first = results[0];
-  const int width = first.width;
-  const int pitch = first.pitch;
-  const int height = first.height;
-#if 1
-  // One block per image row and LAPLACE_W-wide column tile.
-  dim3 blocks(iDivUp(width, LAPLACE_W), height);
-  dim3 threads(LAPLACE_W+2*LAPLACE_R);
-  LaplaceMultiMem<<<blocks, threads>>>(baseImage.d_data, first.d_data, width, pitch, height, octave);
-#endif
-#if 0
-  dim3 blocks(iDivUp(width, LAPLACE_W), iDivUp(height, LAPLACE_H));
-  dim3 threads(LAPLACE_W+2*LAPLACE_R, LAPLACE_S);
-  LaplaceMultiMemTest<<<blocks, threads>>>(baseImage.d_data, first.d_data, width, pitch, height, octave);
-#endif
-#if 0
-  dim3 blocks(iDivUp(width, LAPLACE_W), height);
-  dim3 threads(LAPLACE_W+2*LAPLACE_R, LAPLACE_S);
-  LaplaceMultiMemOld<<<blocks, threads>>>(baseImage.d_data, first.d_data, width, pitch, height, octave);
-#endif
-#if 0
-  dim3 blocks(iDivUp(width, LAPLACE_W), height);
-  dim3 threads(LAPLACE_W+2*LAPLACE_R, LAPLACE_S);
-  LaplaceMultiTex<<<blocks, threads>>>(texObj, first.d_data, width, pitch, height, octave);
-#endif
-  checkMsg("LaplaceMulti() execution failed\n");
-  return 0.0;
-}
-
-// Launches the point-detection kernel over the difference images of one octave.
-// The input starts at sources->d_data and the image geometry is taken from the first image.
-// Prints a message and does nothing when that image has no device data. Always returns 0.0.
-double FindPointsMulti(CudaImage *sources, SiftData &siftData, float thresh, float edgeLimit, float factor, float lowestScale, float subsampling, int octave)
-{
-  if (!sources->d_data) {
-    printf("FindPointsMulti: missing data\n");
-    return 0.0;
-  }
-  const int w = sources->width;
-  const int p = sources->pitch;
-  const int h = sources->height;
-  // Grid shared by all variants: NUM_SCALES groups of column tiles by row tiles.
-  dim3 blocks(iDivUp(w, MINMAX_W)*NUM_SCALES, iDivUp(h, MINMAX_H));
-#if 0
-  dim3 threads(MINMAX_W + 2, MINMAX_H);
-  FindPointsMultiTest<<<blocks, threads>>>(sources->d_data, siftData.d_data, w, p, h, subsampling, lowestScale, thresh, factor, edgeLimit, octave); 
-#endif
-#if 1
-  dim3 threads(MINMAX_W + 2);
-#ifdef MANAGEDMEM
-  FindPointsMulti<<<blocks, threads>>>(sources->d_data, siftData.m_data, w, p, h, subsampling, lowestScale, thresh, factor, edgeLimit, octave); 
-#else
-  FindPointsMultiNew<<<blocks, threads>>>(sources->d_data, siftData.d_data, w, p, h, subsampling, lowestScale, thresh, factor, edgeLimit, octave);
-#endif
-#endif
-  checkMsg("FindPointsMulti() execution failed\n");
-  return 0.0;
-}
-
diff --git a/cudaSiftH.h b/cudaSiftH.h
deleted file mode 100644
index 2919545..0000000
--- a/cudaSiftH.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef CUDASIFTH_H
-#define CUDASIFTH_H
-
-#include "cudautils.h"
-#include "cudaImage.h"
-
-//********************************************************//
-// CUDA SIFT extractor by Marten Bjorkman aka Celebrandil //
-//********************************************************//  
-
-// Octave recursion and per-octave pipeline (defined in cudaSiftH.cu).
-// ExtractSiftLoop always returns 0.
-int ExtractSiftLoop(SiftData &siftData, CudaImage &img, int numOctaves, double initBlur, float thresh, float lowestScale, float subsampling, float *memoryTmp, float *memorySub);
-void ExtractSiftOctave(SiftData &siftData, CudaImage &img, int octave, float thresh, float lowestScale, float subsampling, float *memoryTmp);
-// Kernel-launch wrappers. Each one launches its kernel, checks for a launch error with
-// checkMsg() and returns 0.0; the double return value carries no information.
-double ScaleDown(CudaImage &res, CudaImage &src, float variance);
-double ScaleUp(CudaImage &res, CudaImage &src);
-double ComputeOrientations(cudaTextureObject_t texObj, CudaImage &src, SiftData &siftData, int octave);
-double ExtractSiftDescriptors(cudaTextureObject_t texObj, SiftData &siftData, float subsampling, int octave);
-double OrientAndExtract(cudaTextureObject_t texObj, SiftData &siftData, float subsampling, int octave);
-double RescalePositions(SiftData &siftData, float scale);
-double LowPass(CudaImage &res, CudaImage &src, float scale);
-// Fills `kernel` on the host for octave indices 1..numOctaves (12*16 floats per octave).
-void PrepareLaplaceKernels(int numOctaves, float initBlur, float *kernel);
-double LaplaceMulti(cudaTextureObject_t texObj, CudaImage &baseImage, CudaImage *results, int octave);
-double FindPointsMulti(CudaImage *sources, SiftData &siftData, float thresh, float edgeLimit, float factor, float lowestScale, float subsampling, int octave);
-
-#endif
diff --git a/cudautils.h b/cudautils.h
deleted file mode 100644
index cd87ddb..0000000
--- a/cudautils.h
+++ /dev/null
@@ -1,138 +0,0 @@
-#ifndef CUDAUTILS_H
-#define CUDAUTILS_H
-
-#include <cstdio>
-#include <iostream>
-
-#ifdef WIN32
-#include <intrin.h>
-#endif
-
-#define safeCall(err)       __safeCall(err, __FILE__, __LINE__)
-#define safeThreadSync()    __safeThreadSync(__FILE__, __LINE__)
-#define checkMsg(msg)       __checkMsg(msg, __FILE__, __LINE__)
-
-inline void __safeCall(cudaError err, const char *file, const int line)
-{
-  if (cudaSuccess != err) {
-    fprintf(stderr, "safeCall() Runtime API error in file <%s>, line %i : %s.\n", file, line, cudaGetErrorString(err));
-    exit(-1);
-  }
-}
-
-inline void __safeThreadSync(const char *file, const int line)
-{
-  cudaError err = cudaDeviceSynchronize();
-  if (cudaSuccess != err) {
-    fprintf(stderr, "threadSynchronize() Driver API error in file '%s' in line %i : %s.\n", file, line, cudaGetErrorString(err));
-    exit(-1);
-  }
-}
-
-inline void __checkMsg(const char *errorMessage, const char *file, const int line)
-{
-  cudaError_t err = cudaGetLastError();
-  if (cudaSuccess != err) {
-    fprintf(stderr, "checkMsg() CUDA error: %s in file <%s>, line %i : %s.\n", errorMessage, file, line, cudaGetErrorString(err));
-    exit(-1);
-  }
-}
-
-inline bool deviceInit(int dev)
-{
-  int deviceCount;
-  safeCall(cudaGetDeviceCount(&deviceCount));
-  if (deviceCount == 0) {
-    fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
-    return false;
-  }
-  if (dev < 0) dev = 0;						
-  if (dev > deviceCount-1) dev = deviceCount - 1;
-  cudaDeviceProp deviceProp;
-  safeCall(cudaGetDeviceProperties(&deviceProp, dev));
-  if (deviceProp.major < 1) {
-    fprintf(stderr, "error: device does not support CUDA.\n");
-    return false;					
-  }
-  safeCall(cudaSetDevice(dev));
-  return true;
-}
-
-class TimerGPU {
-public:
-  cudaEvent_t start, stop; 
-  cudaStream_t stream;
-  TimerGPU(cudaStream_t stream_ = 0) : stream(stream_) {
-    cudaEventCreate(&start); 
-    cudaEventCreate(&stop); 
-    cudaEventRecord(start, stream); 
-  }
-  ~TimerGPU() {
-    cudaEventDestroy(start); 
-    cudaEventDestroy(stop);  
-  }
-  float read() {
-    cudaEventRecord(stop, stream); 
-    cudaEventSynchronize(stop); 
-    float time;
-    cudaEventElapsedTime(&time, start, stop);
-    return time;
-  }
-};
-
-class TimerCPU
-{
-  static const int bits = 10;
-public:
-  long long beg_clock;
-  float freq;
-  TimerCPU(float freq_) : freq(freq_) {   // freq = clock frequency in MHz
-    beg_clock = getTSC(bits);
-  }
-  long long getTSC(int bits) {
-#ifdef WIN32
-    return __rdtsc()/(1LL<<bits);
-#else
-    unsigned int low, high;
-    __asm__(".byte 0x0f, 0x31" :"=a" (low), "=d" (high));
-    return ((long long)high<<(32-bits)) | ((long long)low>>bits);
-#endif
-  }
-  float read() {
-    long long end_clock = getTSC(bits);
-    long long Kcycles = end_clock - beg_clock;
-    float time = (float)(1<<bits)*Kcycles/freq/1e3f;
-    return time;
-  }
-};
-
-template <class T>
-__device__ __inline__ T ShiftDown(T var, unsigned int delta, int width = 32) {
-#if (CUDART_VERSION >= 9000)
-  return __shfl_down_sync(0xffffffff, var, delta, width);
-#else
-  return __shfl_down(var, delta, width);
-#endif
-}
-
-template <class T>
-__device__ __inline__ T ShiftUp(T var, unsigned int delta, int width = 32) {
-#if (CUDART_VERSION >= 9000)
-  return __shfl_up_sync(0xffffffff, var, delta, width);
-#else
-  return __shfl_up(var, delta, width);
-#endif
-}
-
-template <class T>
-__device__ __inline__ T Shuffle(T var, unsigned int lane, int width = 32) {
-#if (CUDART_VERSION >= 9000)
-  return __shfl_sync(0xffffffff, var, lane, width);
-#else
-  return __shfl(var, lane, width);
-#endif
-}
-
-
-#endif
-
diff --git a/mainSift.cpp b/mainSift.cpp
deleted file mode 100644
index 0e4567a..0000000
--- a/mainSift.cpp
+++ /dev/null
@@ -1,202 +0,0 @@
-//********************************************************//
-// CUDA SIFT extractor by Marten Björkman aka Celebrandil //
-//              celle @ csc.kth.se                       //
-//********************************************************//  
-
-#include <iostream>  
-#include <cmath>
-#include <iomanip>
-#include <opencv2/core/core.hpp>
-#include <opencv2/highgui/highgui.hpp>
-#include <opencv2/imgproc/imgproc.hpp>
-
-#include "cudaImage.h"
-#include "cudaSift.h"
-
-int ImproveHomography(SiftData &data, float *homography, int numLoops, float minScore, float maxAmbiguity, float thresh);
-void PrintMatchData(SiftData &siftData1, SiftData &siftData2, CudaImage &img);
-void MatchAll(SiftData &siftData1, SiftData &siftData2, float *homography);
-
-double ScaleUp(CudaImage &res, CudaImage &src);
-
-///////////////////////////////////////////////////////////////////////////////
-// Main program
-///////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) 
-{    
-  int devNum = 0, imgSet = 0;
-  if (argc>1)
-    devNum = std::atoi(argv[1]);
-  if (argc>2)
-    imgSet = std::atoi(argv[2]);
-
-  // Read images using OpenCV
-  cv::Mat limg, rimg;
-  if (imgSet) {
-    cv::imread("data/left.pgm", 0).convertTo(limg, CV_32FC1);
-    cv::imread("data/righ.pgm", 0).convertTo(rimg, CV_32FC1);
-  } else {
-    cv::imread("data/img1.png", 0).convertTo(limg, CV_32FC1);
-    cv::imread("data/img2.png", 0).convertTo(rimg, CV_32FC1);
-  }
-  //cv::flip(limg, rimg, -1);
-  unsigned int w = limg.cols;
-  unsigned int h = limg.rows;
-  std::cout << "Image size = (" << w << "," << h << ")" << std::endl;
-  
-  // Initial Cuda images and download images to device
-  std::cout << "Initializing data..." << std::endl;
-  InitCuda(devNum); 
-  CudaImage img1, img2;
-  img1.Allocate(w, h, iAlignUp(w, 128), false, NULL, (float*)limg.data);
-  img2.Allocate(w, h, iAlignUp(w, 128), false, NULL, (float*)rimg.data);
-  img1.Download();
-  img2.Download(); 
-
-  // Extract Sift features from images
-  SiftData siftData1, siftData2;
-  float initBlur = 1.0f;
-  float thresh = (imgSet ? 4.5f : 3.0f);
-  InitSiftData(siftData1, 32768, true, true); 
-  InitSiftData(siftData2, 32768, true, true);
-  
-  // A bit of benchmarking 
-  //for (int thresh1=1.00f;thresh1<=4.01f;thresh1+=0.50f) {
-  float *memoryTmp = AllocSiftTempMemory(w, h, 5, false);
-    for (int i=0;i<1000;i++) {
-      ExtractSift(siftData1, img1, 5, initBlur, thresh, 0.0f, false, memoryTmp);
-      ExtractSift(siftData2, img2, 5, initBlur, thresh, 0.0f, false, memoryTmp);
-    }
-    FreeSiftTempMemory(memoryTmp);
-    
-    // Match Sift features and find a homography
-    for (int i=0;i<1;i++)
-      MatchSiftData(siftData1, siftData2);
-    float homography[9];
-    int numMatches;
-    FindHomography(siftData1, homography, &numMatches, 10000, 0.00f, 0.80f, 5.0);
-    int numFit = ImproveHomography(siftData1, homography, 5, 0.00f, 0.80f, 3.0);
-    
-    std::cout << "Number of original features: " <<  siftData1.numPts << " " << siftData2.numPts << std::endl;
-    std::cout << "Number of matching features: " << numFit << " " << numMatches << " " << 100.0f*numFit/std::min(siftData1.numPts, siftData2.numPts) << "% " << initBlur << " " << thresh << std::endl;
-    //}
-  
-  // Print out and store summary data
-  PrintMatchData(siftData1, siftData2, img1);
-  cv::imwrite("data/limg_pts.pgm", limg);
-
-  //MatchAll(siftData1, siftData2, homography);
-  
-  // Free Sift data from device
-  FreeSiftData(siftData1);
-  FreeSiftData(siftData2);
-}
-
-void MatchAll(SiftData &siftData1, SiftData &siftData2, float *homography)
-{
-#ifdef MANAGEDMEM
-  SiftPoint *sift1 = siftData1.m_data;
-  SiftPoint *sift2 = siftData2.m_data;
-#else
-  SiftPoint *sift1 = siftData1.h_data;
-  SiftPoint *sift2 = siftData2.h_data;
-#endif
-  int numPts1 = siftData1.numPts;
-  int numPts2 = siftData2.numPts;
-  int numFound = 0;
-#if 1
-  homography[0] = homography[4] = -1.0f;
-  homography[1] = homography[3] = homography[6] = homography[7] = 0.0f;
-  homography[2] = 1279.0f;
-  homography[5] = 959.0f;
-#endif
-  for (int i=0;i<numPts1;i++) {
-    float *data1 = sift1[i].data;
-    std::cout << i << ":" << sift1[i].scale << ":" << (int)sift1[i].orientation << " " << sift1[i].xpos << " " << sift1[i].ypos << std::endl;
-    bool found = false;
-    for (int j=0;j<numPts2;j++) {
-      float *data2 = sift2[j].data;
-      float sum = 0.0f;
-      for (int k=0;k<128;k++) 
-	sum += data1[k]*data2[k];    
-      float den = homography[6]*sift1[i].xpos + homography[7]*sift1[i].ypos + homography[8];
-      float dx = (homography[0]*sift1[i].xpos + homography[1]*sift1[i].ypos + homography[2]) / den - sift2[j].xpos;
-      float dy = (homography[3]*sift1[i].xpos + homography[4]*sift1[i].ypos + homography[5]) / den - sift2[j].ypos;
-      float err = dx*dx + dy*dy;
-      if (err<100.0f) // 100.0
-	found = true;
-      if (err<100.0f || j==sift1[i].match) { // 100.0
-	if (j==sift1[i].match && err<100.0f)
-	  std::cout << " *";
-	else if (j==sift1[i].match) 
-	  std::cout << " -";
-	else if (err<100.0f)
-	  std::cout << " +";
-	else
-	  std::cout << "  ";
-	std::cout << j << ":" << sum << ":" << (int)sqrt(err) << ":" << sift2[j].scale << ":" << (int)sift2[j].orientation << " " << sift2[j].xpos << " " << sift2[j].ypos << " " << (int)dx << " " << (int)dy << std::endl;
-      }
-    }
-    std::cout << std::endl;
-    if (found)
-      numFound++;
-  }
-  std::cout << "Number of finds: " << numFound << " / " << numPts1 << std::endl;
-  std::cout << homography[0] << " " << homography[1] << " " << homography[2] << std::endl;//%%%
-  std::cout << homography[3] << " " << homography[4] << " " << homography[5] << std::endl;//%%%
-  std::cout << homography[6] << " " << homography[7] << " " << homography[8] << std::endl;//%%%
-}
-
-void PrintMatchData(SiftData &siftData1, SiftData &siftData2, CudaImage &img)
-{
-  int numPts = siftData1.numPts;
-#ifdef MANAGEDMEM
-  SiftPoint *sift1 = siftData1.m_data;
-  SiftPoint *sift2 = siftData2.m_data;
-#else
-  SiftPoint *sift1 = siftData1.h_data;
-  SiftPoint *sift2 = siftData2.h_data;
-#endif
-  float *h_img = img.h_data;
-  int w = img.width;
-  int h = img.height;
-  std::cout << std::setprecision(3);
-  for (int j=0;j<numPts;j++) { 
-    int k = sift1[j].match;
-    if (sift1[j].match_error<5) {
-      float dx = sift2[k].xpos - sift1[j].xpos;
-      float dy = sift2[k].ypos - sift1[j].ypos;
-#if 0
-      if (false && sift1[j].xpos>550 && sift1[j].xpos<600) {
-	std::cout << "pos1=(" << (int)sift1[j].xpos << "," << (int)sift1[j].ypos << ") ";
-	std::cout << j << ": " << "score=" << sift1[j].score << "  ambiguity=" << sift1[j].ambiguity << "  match=" << k << "  ";
-	std::cout << "scale=" << sift1[j].scale << "  ";
-	std::cout << "error=" << (int)sift1[j].match_error << "  ";
-	std::cout << "orient=" << (int)sift1[j].orientation << "," << (int)sift2[k].orientation << "  ";
-	std::cout << " delta=(" << (int)dx << "," << (int)dy << ")" << std::endl;
-      }
-#endif
-#if 1
-      int len = (int)(fabs(dx)>fabs(dy) ? fabs(dx) : fabs(dy));
-      for (int l=0;l<len;l++) {
-	int x = (int)(sift1[j].xpos + dx*l/len);
-	int y = (int)(sift1[j].ypos + dy*l/len);
-	h_img[y*w+x] = 255.0f;
-      }
-#endif
-    }
-    int x = (int)(sift1[j].xpos+0.5);
-    int y = (int)(sift1[j].ypos+0.5);
-    int s = std::min(x, std::min(y, std::min(w-x-2, std::min(h-y-2, (int)(1.41*sift1[j].scale)))));
-    int p = y*w + x;
-    p += (w+1);
-    for (int k=0;k<s;k++) 
-      h_img[p-k] = h_img[p+k] = h_img[p-k*w] = h_img[p+k*w] = 0.0f;
-    p -= (w+1);
-    for (int k=0;k<s;k++) 
-      h_img[p-k] = h_img[p+k] = h_img[p-k*w] =h_img[p+k*w] = 255.0f;
-  }
-  std::cout << std::setprecision(6);
-}
-
-
diff --git a/match.cu b/match.cu
deleted file mode 100644
index 02e6dc7..0000000
--- a/match.cu
+++ /dev/null
@@ -1,1081 +0,0 @@
-//**********************************************************//
-//   Matching test code by Marten Bjorkman aka Celebrandil  //
-//                                                          //
-//   The code includes an example of gradual optimization   //
-//   of a kernel for matching two sets of 16K 128D points.  //
-//   You are welcome to the code for educational purposes.  //
-//                                                          //
-//            Fairlight - When Dreams Come True             //
-// https://www.youtube.com/channel/UCdHiji77FlppuNK6xemrsVA //
-//**********************************************************//
-
-#include <cuda.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <vector>
-#include <memory>
-#include <algorithm>
-#include <immintrin.h>
-#include "cudautils.h"
-
-#define RUNCPU 1
-#define CHECK  1
-#define NPTS (2048*8)
-#define NDIM 128
-
-#define M1W  128
-#define M2W   16
-#define M2H   16
-#define M5W   16
-#define M5H   16
-#define M5R    4
-#define M7W   32
-#define M7H   32
-#define M7R    4
-
-
-/*
-Data size:   16 MB
-Allocate:    1.01194 ms
-Upload:      3.69939 ms  4.32503 MB/ms
-MatchCPU1:   34649.6 ms  1.89139 Gflops
-MatchCPU2:   3064.36 ms  21.3866 Gflops
-MatchCPU3:   184.762 ms  354.706 Gflops
-MatchGPU1:   641.828 ms  102.108 Gflops
-MatchGPU2:   148.020 ms  442.752 Gflops
-MatchGPU3:   31.9609 ms  2050.50 Gflops
-MatchGPU4:   29.7891 ms  2200.00 Gflops
-MatchGPU5:   17.1484 ms  3821.69 Gflops
-MatchGPU6:   16.3516 ms  4007.94 Gflops
-MatchGPU7:   14.7995 ms  4428.27 Gflops
-MatchGPU8:   10.5291 ms  6224.28 Gflops
-Download:    0.16016 ms  0.780488 MB/ms
-*/
-
-void MatchC1(float *h_pts1, float *h_pts2, float *h_score, int *h_index)
-{
-  std::memset(h_score, 0, sizeof(float)*NPTS);
-  for (int p1=0;p1<NPTS;p1++) {
-    for (int p2=0;p2<NPTS;p2++) {
-      float score = 0.0f;
-      for (int d=0;d<NDIM;d++)
-	score += h_pts1[p1*NDIM + d]*h_pts2[p2*NDIM + d];
-      if (score>h_score[p1]) {
-	h_score[p1] = score;
-	h_index[p1] = p2;
-      }
-    }
-  }
-}
-
-void MatchC2(float *h_pts1, float *h_pts2, float *h_score, int *h_index)
-{
-#define BSIZ  256
-  std::memset(h_score, 0, sizeof(float)*NPTS);
-  for (int b1=0;b1<NPTS;b1+=BSIZ) {
-    for (int b2=0;b2<NPTS;b2+=BSIZ) {
-      for (int p1=b1;p1<b1+BSIZ;p1++) {
-	float *pt1 = &h_pts1[p1*NDIM];
-	for (int p2=b2;p2<b2+BSIZ;p2++) {
-	  float *pt2 = &h_pts2[p2*NDIM];
-	  __m256 score8 = _mm256_setzero_ps();
-	  for (int d=0;d<NDIM;d+=8) {
-	    __m256 v1 = _mm256_load_ps(pt1 + d);
-	    __m256 v2 = _mm256_load_ps(pt2 + d);
-	    score8 = _mm256_fmadd_ps(v1, v2, score8);
-	  }
-	  score8 = _mm256_add_ps(score8, _mm256_permute2f128_ps(score8, score8, 1));
-	  score8 = _mm256_hadd_ps(score8, score8);
-	  float score = _mm256_cvtss_f32(_mm256_hadd_ps(score8, score8));
-	  if (score>h_score[p1]) {
-	    h_score[p1] = score;
-	    h_index[p1] = p2;
-	  }
-	}
-      }
-    }
-  }
-}
-
-void MatchC3(float *h_pts1, float *h_pts2, float *h_score, int *h_index)
-{
-#define BSIZ  256
-  std::memset(h_score, 0, sizeof(float)*NPTS);
-#pragma omp parallel for
-  for (int b1=0;b1<NPTS;b1+=BSIZ) {
-    for (int b2=0;b2<NPTS;b2+=BSIZ) {
-      for (int p1=b1;p1<b1+BSIZ;p1++) {
-	float *pt1 = &h_pts1[p1*NDIM];
-	for (int p2=b2;p2<b2+BSIZ;p2++) {
-	  float *pt2 = &h_pts2[p2*NDIM];
-	  __m256 score8 = _mm256_setzero_ps();
-	  for (int d=0;d<NDIM;d+=8) {
-	    __m256 v1 = _mm256_load_ps(pt1 + d);
-	    __m256 v2 = _mm256_load_ps(pt2 + d);
-	    score8 = _mm256_fmadd_ps(v1, v2, score8);
-	  }
-	  score8 = _mm256_add_ps(score8, _mm256_permute2f128_ps(score8, score8, 1));
-	  score8 = _mm256_hadd_ps(score8, score8);
-	  float score = _mm256_cvtss_f32(_mm256_hadd_ps(score8, score8));
-	  if (score>h_score[p1]) {
-	    h_score[p1] = score;
-	    h_index[p1] = p2;
-	  }
-	}
-      }
-    }
-  }
-}
-
-void CheckMatches(int *h_index, int *h_index2, float *h_score, float *h_score2)
-{
-  int ndiff = 0;
-  for (int i=0;i<NPTS;i++) {
-    ndiff += (h_index[i] != h_index2[i]);
-    if (h_index[i] != h_index2[i])
-      std::cout << "  " << i << " " << h_index[i] << " " << h_index2[i] << " " << h_score[i] << " " << h_score2[i] << std::endl;
-  }
-  std::cout << "Number of incorrect matches: " << ndiff << std::endl;
-}
-      
-
-__global__ void Match1(float *d_pts1, float *d_pts2, float *d_score, int *d_index)
-{
-  int p1 = threadIdx.x + M1W*blockIdx.x;
-  float max_score = 0.0f;
-  int index = -1;
-  
-  for (int p2=0;p2<NPTS;p2++) {
-    float score = 0.0f;
-    for (int d=0;d<NDIM;d++)
-      score += d_pts1[p1*NDIM + d]*d_pts2[p2*NDIM + d];
-    if (score>max_score) {
-      max_score = score;
-      index = p2;
-    }
-  }
-  
-  d_score[p1] = max_score;
-  d_index[p1] = index;
-}
-
-__global__ void Match2(float *d_pts1, float *d_pts2, float *d_score, int *d_index)
-{
-  __shared__ float buffer1[M2W*NDIM];  //%%%%
-  __shared__ float buffer2[M2H*NDIM];  //%%%%
-  __shared__ float scores[M2W*M2H];    //%%%%
-  int tx = threadIdx.x;
-  int ty = threadIdx.y;
-  int idx = tx + M2W*ty;
-  int bp1 = M2W*blockIdx.x;
-  if (ty<M2W)
-    for (int d=tx;d<NDIM;d+=M2W)
-      for (int j=ty;j<M2W;j+=M2H)
-	buffer1[j*NDIM + d] = d_pts1[(bp1 + j)*NDIM + d];   //%%%%
-  __syncthreads();
-  
-  float max_score = 0.0f;
-  int index = -1;
-  for (int bp2=0;bp2<NPTS;bp2+=M2H) {
-    for (int d=tx;d<NDIM;d+=M2W)
-      buffer2[ty*NDIM + d] = d_pts2[(bp2 + ty)*NDIM + d]; //%%%%
-    __syncthreads();
-
-    float score = 0.0f;
-    for (int d=0;d<NDIM;d++) 
-      score += buffer1[tx*NDIM + d]*buffer2[ty*NDIM + d];   //%%%%
-    scores[idx] = score;
-    __syncthreads();
-    
-    if (ty==0) {
-      for (int i=0;i<M2H;i++) {
-	if (scores[i*M2W + tx]>max_score) {
-	  max_score = scores[i*M2W + tx];
-	  index = bp2 + i;
-	}
-      }
-    }
-    __syncthreads();
-  }
-  
-  if (ty==0) {
-    d_score[bp1 + tx] = max_score;
-    d_index[bp1 + tx] = index;
-  }
-}
-
-
-__global__ void Match3(float *d_pts1, float *d_pts2, float *d_score, int *d_index)
-{
-  __shared__ float buffer1[M2W*(NDIM + 1)]; //%%%%
-  __shared__ float buffer2[M2H*NDIM];
-  __shared__ float scores[M2W*M2H];
-  int tx = threadIdx.x;
-  int ty = threadIdx.y;
-  int idx = tx + M2W*ty;
-  int bp1 = M2W*blockIdx.x;
-  if (ty<M2W)
-    for (int d=tx;d<NDIM;d+=M2W)
-      for (int j=ty;j<M2W;j+=M2H)
-	buffer1[j*(NDIM + 1) + d] = d_pts1[(bp1 + j)*NDIM + d]; //%%%%
-  __syncthreads();
-  
-  float max_score = 0.0f;
-  int index = -1;
-  for (int bp2=0;bp2<NPTS;bp2+=M2H) {
-    for (int d=tx;d<NDIM;d+=M2W)
-      buffer2[ty*NDIM + d] = d_pts2[(bp2 + ty)*NDIM + d];
-    __syncthreads();
-
-    float score = 0.0f;
-    for (int d=0;d<NDIM;d++) 
-      score += buffer1[tx*(NDIM + 1) + d]*buffer2[ty*NDIM + d]; //%%%%
-    scores[idx] = score;
-    __syncthreads();
-    
-    if (ty==0) {
-      for (int i=0;i<M2H;i++) {
-	if (scores[i*M2W + tx]>max_score) {
-	  max_score = scores[i*M2W + tx];
-	  index = bp2 + i;
-	}
-      }
-    }
-    __syncthreads();
-  }
-  
-  if (ty==0) {
-    d_score[bp1 + tx] = max_score;
-    d_index[bp1 + tx] = index;
-  }
-}
-
-
-__global__ void Match4(float *d_pts1, float *d_pts2, float *d_score, int *d_index)
-{
-  __shared__ float4 buffer1[M2W*(NDIM/4 + 1)];  //%%%%
-  __shared__ float4 buffer2[M2H*NDIM/4];        //%%%%
-  __shared__ float scores[M2W*M2H];
-  int tx = threadIdx.x;
-  int ty = threadIdx.y;
-  int idx = tx + M2W*ty;
-  int bp1 = M2W*blockIdx.x;
-  if (ty<M2W)
-    for (int d=tx;d<NDIM/4;d+=M2W)
-      for (int j=ty;j<M2W;j+=M2H)
-	buffer1[j*(NDIM/4 + 1) + d] = ((float4*)d_pts1)[(bp1 + j)*(NDIM/4) + d]; //%%%%
-  __syncthreads();
-  
-  float max_score = 0.0f;
-  int index = -1;
-  for (int bp2=0;bp2<NPTS;bp2+=M2H) {
-    for (int d=tx;d<NDIM/4;d+=M2W)
-      buffer2[ty*NDIM/4 + d] = ((float4*)d_pts2)[(bp2 + ty)*(NDIM/4) + d]; //%%%%
-    __syncthreads();
-
-    float score = 0.0f;
-    for (int d=0;d<NDIM/4;d++) {
-      float4 v1 = buffer1[tx*(NDIM/4 + 1) + d]; //%%%%
-      float4 v2 = buffer2[ty*(NDIM/4) + d];     //%%%%
-      score += v1.x*v2.x; score += v1.y*v2.y;
-      score += v1.z*v2.z; score += v1.w*v2.w;
-    }
-    scores[idx] = score;
-    __syncthreads();
-    
-    if (ty==0) {
-      for (int i=0;i<M2H;i++) {
-	if (scores[i*M2W + tx]>max_score) {
-	  max_score = scores[i*M2W + tx];
-	  index = bp2 + i;
-	}
-      }
-    }
-    __syncthreads();
-  }
-  
-  if (ty==0) {
-    d_score[bp1 + tx] = max_score;
-    d_index[bp1 + tx] = index;
-  }
-}
-
-__global__ void Match5(float *d_pts1, float *d_pts2, float *d_score, int *d_index)
-{
-  __shared__ float4 buffer1[M5W*(NDIM/4 + 1)]; 
-  __shared__ float4 buffer2[M5H*NDIM/4];       
-  __shared__ float scores[M5W*M5H];
-  int tx = threadIdx.x;
-  int ty = threadIdx.y;
-  int bp1 = M5W*blockIdx.x;
-  if (ty<M5W)
-    for (int d=tx;d<NDIM/4;d+=M5W)
-      for (int j=ty;j<M5W;j+=M5H)
-	buffer1[j*(NDIM/4 + 1) + d] = ((float4*)d_pts1)[(bp1 + j)*(NDIM/4) + d];
-  __syncthreads();
-  
-  float max_score = 0.0f;
-  int index = -1;
-  for (int bp2=0;bp2<NPTS;bp2+=M5H) {
-    for (int d=tx;d<NDIM/4;d+=M5W)
-      buffer2[ty*NDIM/4 + d] = ((float4*)d_pts2)[(bp2 + ty)*(NDIM/4) + d];
-    __syncthreads();
-
-    if (ty<M5H/M5R) {  //%%%%
-      float score[M5R];                                    //%%%%
-      for (int dy=0;dy<M5R;dy++)
-	score[dy] = 0.0f;
-      for (int d=0;d<NDIM/4;d++) {
-	float4 v1 = buffer1[tx*(NDIM/4 + 1) + d];
-	for (int dy=0;dy<M5R;dy++) {
-	  float4 v2 = buffer2[(M5R*ty + dy)*(NDIM/4) + d];    //%%%%
-	  score[dy] += v1.x*v2.x; score[dy] += v1.y*v2.y;
-	  score[dy] += v1.z*v2.z; score[dy] += v1.w*v2.w;
-	}
-      }
-      for (int dy=0;dy<M5R;dy++)
-	scores[tx + M5W*(M5R*ty + dy)] = score[dy];
-    }
-    __syncthreads();
-    
-    if (ty==0) {
-      for (int i=0;i<M5H;i++) {
-	if (scores[i*M2W + tx]>max_score) {
-	  max_score = scores[i*M5W + tx];
-	  index = bp2 + i;
-	}
-      }
-    }
-    __syncthreads();
-  }
-
-  if (ty==0) {
-    d_score[bp1 + tx] = max_score;
-    d_index[bp1 + tx] = index;
-  }
-}
-
-
-__global__ void Match6(float *d_pts1, float *d_pts2, float *d_score, int *d_index)
-{
-  __shared__ float4 buffer1[M5W*(NDIM/4 + 1)]; 
-  __shared__ float4 buffer2[M5H*NDIM/4];       
-  int tx = threadIdx.x;
-  int ty = threadIdx.y;
-  int bp1 = M5W*blockIdx.x;
-  if (ty<M5W)
-    for (int d=tx;d<NDIM/4;d+=M5W)
-      for (int j=ty;j<M5W;j+=M5H)
-	buffer1[j*(NDIM/4 + 1) + d] = ((float4*)d_pts1)[(bp1 + j)*(NDIM/4) + d];
-  
-  float max_score = 0.0f;
-  int index = -1;    
-  for (int bp2=0;bp2<NPTS;bp2+=M5H) {
-    for (int d=tx;d<NDIM/4;d+=M5W)
-      buffer2[ty*NDIM/4 + d] = ((float4*)d_pts2)[(bp2 + ty)*(NDIM/4) + d];
-    __syncthreads();
-
-    if (ty<M5H/M5R) {  
-      float score[M5R];                                    
-      for (int dy=0;dy<M5R;dy++)
-	score[dy] = 0.0f;
-      for (int d=0;d<NDIM/4;d++) {
-	float4 v1 = buffer1[tx*(NDIM/4 + 1) + d];
-	for (int dy=0;dy<M5R;dy++) {
-	  float4 v2 = buffer2[(M5R*ty + dy)*(NDIM/4) + d];    
-	  score[dy] += v1.x*v2.x; score[dy] += v1.y*v2.y;
-	  score[dy] += v1.z*v2.z; score[dy] += v1.w*v2.w;
-	}
-      }
-      for (int dy=0;dy<M5R;dy++) {
-	if (score[dy]>max_score) {   //%%%%
-	  max_score = score[dy];     
-	  index = bp2 + M5R*ty + dy;               
-	}
-      }
-    }
-    __syncthreads();
-  }
-
-  float *scores = (float*)buffer1;
-  int *indices = (int*)&scores[M5W*M5H/M5R];
-  if (ty<M5H/M5R) {
-    scores[ty*M5W + tx] = max_score;  //%%%%
-    indices[ty*M5W + tx] = index;     //%%%%
-  }
-  __syncthreads();
-  
-  if (ty==0) {
-    max_score = scores[tx];
-    index = indices[tx];
-    for (int y=0;y<M5H/M5R;y++)
-      if (scores[y*M5W + tx]>max_score) {
-	max_score = scores[y*M5W + tx]; //%%%%
-	index = indices[y*M5W + tx];    //%%%%
-      }
-    d_score[bp1 + tx] = max_score;
-    d_index[bp1 + tx] = index;
-  }
-}
-
-__global__ void Match7(float *d_pts1, float *d_pts2, float *d_score, int *d_index)
-{
-  __shared__ float4 buffer1[M7W*NDIM/4]; //%%%%
-  __shared__ float4 buffer2[M7H*NDIM/4];       
-  int tx = threadIdx.x;
-  int ty = threadIdx.y;
-  int bp1 = M7W*blockIdx.x;
-  for (int d=tx;d<NDIM/4;d+=M7W)
-    for (int j=ty;j<M7W;j+=M7H/M7R)      //%%%%
-      buffer1[j*NDIM/4 + (d + j)%(NDIM/4)] = ((float4*)d_pts1)[(bp1 + j)*(NDIM/4) + d];
-  
-  float max_score = 0.0f;
-  int index = -1;    
-  for (int bp2=0;bp2<NPTS;bp2+=M7H) {
-    for (int d=tx;d<NDIM/4;d+=M7W)
-      for (int j=ty;j<M7H;j+=M7H/M7R)       //%%%%
-	buffer2[j*NDIM/4 + d] = ((float4*)d_pts2)[(bp2 + j)*(NDIM/4) + d];
-    __syncthreads();
-
-    float score[M7R];                                    
-    for (int dy=0;dy<M7R;dy++)
-      score[dy] = 0.0f;
-    for (int d=0;d<NDIM/4;d++) {
-      float4 v1 = buffer1[tx*NDIM/4 + (d + tx)%(NDIM/4)];
-      for (int dy=0;dy<M7R;dy++) {
-	float4 v2 = buffer2[(M7R*ty + dy)*(NDIM/4) + d];    
-	score[dy] += v1.x*v2.x; score[dy] += v1.y*v2.y;
-	score[dy] += v1.z*v2.z; score[dy] += v1.w*v2.w;
-      }
-    }
-    for (int dy=0;dy<M7R;dy++) {
-      if (score[dy]>max_score) {   
-	max_score = score[dy];     
-	index = bp2 + M7R*ty + dy;               
-      }
-    }
-    __syncthreads();
-  }
-
-  float *scores = (float*)buffer1;
-  int *indices = (int*)&scores[M7W*M7H/M7R];
-  scores[ty*M7W + tx] = max_score;  
-  indices[ty*M7W + tx] = index;     
-  __syncthreads();
-  
-  if (ty==0) {
-    max_score = scores[tx];
-    index = indices[tx];
-    for (int y=0;y<M7H/M7R;y++)
-      if (scores[y*M7W + tx]>max_score) {
-	max_score = scores[y*M7W + tx]; 
-	index = indices[y*M7W + tx];    
-      }
-    d_score[bp1 + tx] = max_score;
-    d_index[bp1 + tx] = index;
-  }
-}
-
-__global__ void Match8(float *d_pts1, float *d_pts2, float *d_score, int *d_index)
-{
-  __shared__ float4 buffer1[M7W*NDIM/4]; 
-  __shared__ float4 buffer2[M7H*NDIM/4];       
-  int tx = threadIdx.x;
-  int ty = threadIdx.y;
-  int bp1 = M7W*blockIdx.x;
-  for (int d=tx;d<NDIM/4;d+=M7W)
-    for (int j=ty;j<M7W;j+=M7H/M7R)     
-      buffer1[j*NDIM/4 + (d + j)%(NDIM/4)] = ((float4*)d_pts1)[(bp1 + j)*(NDIM/4) + d];
-
-#define NRX 2
-  float max_score[NRX];
-  int index[NRX];
-  for (int i=0;i<NRX;i++) {
-    max_score[i] = 0.0f;
-    index[i] = -1;
-  }
-  int idx = ty*M7W + tx;
-  int ix = idx%(M7W/NRX);
-  int iy = idx/(M7W/NRX);
-  for (int bp2=0;bp2<NPTS;bp2+=M7H) {
-    for (int d=tx;d<NDIM/4;d+=M7W)
-      for (int j=ty;j<M7H;j+=M7H/M7R)       
-	buffer2[j*NDIM/4 + d] = ((float4*)d_pts2)[(bp2 + j)*(NDIM/4) + d];
-    __syncthreads();
-
-    if (idx<M7W*M7H/M7R/NRX) {
-      float score[M7R][NRX];                                    
-      for (int dy=0;dy<M7R;dy++)
-	for (int i=0;i<NRX;i++)
-	  score[dy][i] = 0.0f;
-      for (int d=0;d<NDIM/4;d++) {
-	float4 v1[NRX];
-	for (int i=0;i<NRX;i++) 
-	  v1[i] = buffer1[((M7W/NRX)*i + ix)*NDIM/4 + (d + (M7W/NRX)*i + ix)%(NDIM/4)];
-	for (int dy=0;dy<M7R;dy++) {
-	  float4 v2 = buffer2[(M7R*iy + dy)*(NDIM/4) + d];    
-	  for (int i=0;i<NRX;i++) {
-	    score[dy][i] += v1[i].x*v2.x;
-	    score[dy][i] += v1[i].y*v2.y;
-	    score[dy][i] += v1[i].z*v2.z;
-	    score[dy][i] += v1[i].w*v2.w;
-	  }
-	}
-      }
-      for (int dy=0;dy<M7R;dy++) {
-	for (int i=0;i<NRX;i++) {
-	  if (score[dy][i]>max_score[i]) {
-	    max_score[i] = score[dy][i];     
-	    index[i] = bp2 + M7R*iy + dy;
-	  }
-	}
-      }
-    }
-    __syncthreads();
-  }
-
-  float *scores = (float*)buffer1;
-  int *indices = (int*)&scores[M7W*M7H/M7R];
-  if (idx<M7W*M7H/M7R/NRX) {
-    for (int i=0;i<NRX;i++) {
-      scores[iy*M7W + (M7W/NRX)*i + ix] = max_score[i];  
-      indices[iy*M7W + (M7W/NRX)*i + ix] = index[i];
-    }
-  }
-  __syncthreads();
-  
-  if (ty==0) {
-    float max_score = scores[tx];
-    int index = indices[tx];
-    for (int y=0;y<M7H/M7R;y++)
-      if (scores[y*M7W + tx]>max_score) {
-	max_score = scores[y*M7W + tx]; 
-	index = indices[y*M7W + tx];    
-      }
-    d_score[bp1 + tx] = max_score;
-    d_index[bp1 + tx] = index;
-  }
-}
-
-__global__ void Match8small(float *d_pts1, float *d_pts2, float *d_score, int *d_index)
-{
-#define NRX 2
-  __shared__ float4 buffer1[M7W*NDIM/4]; 
-  __shared__ float4 buffer2[M7H*NDIM/4];       
-  int tx = threadIdx.x;
-  int ty = threadIdx.y;
-  int bp1 = M7W*blockIdx.x;
-  for (int d=tx;d<NDIM/4;d+=M7W)
-    for (int j=ty;j<M7W;j+=M7H/M7R/NRX)     
-      buffer1[j*NDIM/4 + (d + j)%(NDIM/4)] = ((float4*)d_pts1)[(bp1 + j)*(NDIM/4) + d];
-
-  float max_score[NRX];
-  int index[NRX];
-  for (int i=0;i<NRX;i++) {
-    max_score[i] = 0.0f;
-    index[i] = -1;
-  }
-  int idx = ty*M7W + tx;
-  int ix = idx%(M7W/NRX);
-  int iy = idx/(M7W/NRX);
-  for (int bp2=0;bp2<NPTS;bp2+=M7H) {
-    for (int d=tx;d<NDIM/4;d+=M7W)
-      for (int j=ty;j<M7H;j+=M7H/M7R/NRX)       
-	buffer2[j*NDIM/4 + d] = ((float4*)d_pts2)[(bp2 + j)*(NDIM/4) + d];
-    __syncthreads();
-
-    float score[M7R][NRX];                                    
-    for (int dy=0;dy<M7R;dy++)
-      for (int i=0;i<NRX;i++)
-	score[dy][i] = 0.0f;
-    for (int d=0;d<NDIM/4;d++) {
-      float4 v1[NRX];
-      for (int i=0;i<NRX;i++) 
-	v1[i] = buffer1[((M7W/NRX)*i + ix)*NDIM/4 + (d + (M7W/NRX)*i + ix)%(NDIM/4)];
-      for (int dy=0;dy<M7R;dy++) {
-	float4 v2 = buffer2[(M7R*iy + dy)*(NDIM/4) + d];    
-	for (int i=0;i<NRX;i++) {
-	  score[dy][i] += v1[i].x*v2.x;
-	  score[dy][i] += v1[i].y*v2.y;
-	  score[dy][i] += v1[i].z*v2.z;
-	  score[dy][i] += v1[i].w*v2.w;
-	}
-      }
-    }
-    for (int dy=0;dy<M7R;dy++) {
-      for (int i=0;i<NRX;i++) {
-	if (score[dy][i]>max_score[i]) {
-	  max_score[i] = score[dy][i];     
-	  index[i] = bp2 + M7R*iy + dy;
-	}
-      }
-    }
-    __syncthreads();
-  }
-
-  float *scores = (float*)buffer1;
-  int *indices = (int*)&scores[M7W*M7H/M7R];
-  if (idx<M7W*M7H/M7R/NRX) {
-    for (int i=0;i<NRX;i++) {
-      scores[iy*M7W + (M7W/NRX)*i + ix] = max_score[i];  
-      indices[iy*M7W + (M7W/NRX)*i + ix] = index[i];
-    }
-  }
-  __syncthreads();
-  
-  if (ty==0) {
-    float max_score = scores[tx];
-    int index = indices[tx];
-    for (int y=0;y<M7H/M7R;y++)
-      if (scores[y*M7W + tx]>max_score) {
-	max_score = scores[y*M7W + tx]; 
-	index = indices[y*M7W + tx];    
-      }
-    d_score[bp1 + tx] = max_score;
-    d_index[bp1 + tx] = index;
-  }
-}
-
-__global__ void Match8blocked(float *d_pts1, float *d_pts2, float *d_score, int *d_index)  // for the M7W d_pts1 rows of this block: largest dot product against the d_pts2 rows -> d_score, winning row -> d_index
-{
-#define NRX 2
-#define NUM (NRX*M7R)                       // float4 columns of each d_pts2 row kept in buffer2 at a time
-  __shared__ float4 buffer1[M7W*NDIM/4];    // M7W rows of d_pts1, NDIM/4 float4 each
-  __shared__ float4 buffer2[M7H*NUM];       // M7H rows of d_pts2, NUM float4 each (one slice)
-  int tx = threadIdx.x;
-  int ty = threadIdx.y;
-  int bp1 = M7W*blockIdx.x;  // first d_pts1 row handled by this block
-  for (int d=tx;d<NDIM/4;d+=M7W)
-    for (int j=ty;j<M7W;j+=M7H/M7R)     
-      buffer1[j*NDIM/4 + (d + j)%(NDIM/4)] = ((float4*)d_pts1)[(bp1 + j)*(NDIM/4) + d];  // column is rotated by the row number j; presumably to spread shared-memory banks - TODO confirm
-
-  float max_score[NRX];
-  int index[NRX];
-  for (int i=0;i<NRX;i++) {
-    max_score[i] = 0.0f;  // a score must exceed 0 to be recorded
-    index[i] = -1;        // stays -1 when nothing was recorded
-  }
-  int idx = ty*M7W + tx;   // linear thread number inside the block
-  int ix = idx%(M7W/NRX);  // this thread scores buffer1 rows ix + (M7W/NRX)*i
-  int iy = idx/(M7W/NRX);  // ... against buffer2 rows M7R*iy + dy
-  for (int bp2=0;bp2<NPTS;bp2+=M7H) {  // walk d_pts2 in tiles of M7H rows
-    float score[M7R][NRX];                                    
-    for (int dy=0;dy<M7R;dy++)
-      for (int i=0;i<NRX;i++)
-	score[dy][i] = 0.0f;
-
-    int d = (idx%NUM);
-    int j = (idx/NUM);
-    buffer2[j*NUM + d] = ((float4*)d_pts2)[(bp2 + j)*(NDIM/4) + d];  // one float4 per thread; assumes the block has exactly M7H*NUM threads - TODO confirm against the launch
-    __syncthreads();
-    for (int dp=0;dp<NDIM/4;dp+=NUM) {  // one slice of NUM float4 columns per pass
-      float4 temp;
-      if (dp<(NDIM/4-NUM))
-	temp = ((float4*)d_pts2)[(bp2 + j)*(NDIM/4) + dp + d + NUM];  // fetch the next slice into a register before computing on the current one
-
-      if (idx<M7W*M7H/M7R/NRX) {  // only these threads accumulate; every thread takes part in the loads
-	for (int d=0;d<NUM;d++) {  // note: this d shadows the outer d
-	  float4 v1[NRX];
-#pragma unroll
-	  for (int i=0;i<NRX;i++) 
-	    v1[i] = buffer1[(((M7W/NRX)*i + ix)<<5) + ((dp + d + (M7W/NRX)*i + ix)&31)];  // <<5 and &31 hard-code NDIM/4 == 32; generic form is in the commented line below
-	  //v1[i] = buffer1[((M7W/NRX)*i + ix)*NDIM/4 + (dp + d + (M7W/NRX)*i + ix)%(NDIM/4)];
-#pragma unroll
-	  for (int dy=0;dy<M7R;dy++) {
-	    float4 v2 = buffer2[(M7R*iy + dy)*NUM + d];    
-#pragma unroll
-	    for (int i=0;i<NRX;i++) {
-	      score[dy][i] += v1[i].x*v2.x;
-	      score[dy][i] += v1[i].y*v2.y;
-	      score[dy][i] += v1[i].z*v2.z;
-	      score[dy][i] += v1[i].w*v2.w;
-	    }
-	  }
-	}
-      }
-      __syncthreads();  // all reads of the current slice are done before it is overwritten
-
-      if (dp<(NDIM/4-NUM)) {
-	buffer2[j*NUM + d] = temp;  // publish the slice fetched above
-	__syncthreads();
-      }
-    }
-    for (int dy=0;dy<M7R;dy++) {
-      for (int i=0;i<NRX;i++) {
-	if (score[dy][i]>max_score[i]) {
-	  max_score[i] = score[dy][i];     
-	  index[i] = bp2 + M7R*iy + dy;  // row number in d_pts2
-	}
-      }
-    }
-    __syncthreads();
-  }
-
-  float *scores = (float*)buffer1;            // buffer1 is reused as scratch for the final reduction
-  int *indices = (int*)&scores[M7W*M7H/M7R];  // index table starts right after the score table
-  if (idx<M7W*M7H/M7R/NRX) {
-    for (int i=0;i<NRX;i++) {
-      scores[iy*M7W + (M7W/NRX)*i + ix] = max_score[i];  
-      indices[iy*M7W + (M7W/NRX)*i + ix] = index[i];
-    }
-  }
-  __syncthreads();
-  
-  if (ty==0) {  // one thread per d_pts1 row picks the best of the M7H/M7R partial results
-    float max_score = scores[tx];
-    int index = indices[tx];
-    for (int y=0;y<M7H/M7R;y++)
-      if (scores[y*M7W + tx]>max_score) {
-	max_score = scores[y*M7W + tx]; 
-	index = indices[y*M7W + tx];    
-      }
-    d_score[bp1 + tx] = max_score;
-    d_index[bp1 + tx] = index;
-  }
-}
-
-__global__ void Match8blocked2(float *d_pts1, float *d_pts2, float *d_score, int *d_index)  // same result layout as Match8blocked, but each d_pts2 slice is loaded inside the slice loop without a register prefetch
-{
-#define NRX 2
-#define NUM (NRX*M7R)                       // float4 columns of each d_pts2 row kept in buffer2 at a time
-  __shared__ float4 buffer1[M7W*NDIM/4];    // M7W rows of d_pts1, NDIM/4 float4 each
-  __shared__ float4 buffer2[M7H*NUM];       // M7H rows of d_pts2, NUM float4 each (one slice)
-  int tx = threadIdx.x;
-  int ty = threadIdx.y;
-  int bp1 = M7W*blockIdx.x;  // first d_pts1 row handled by this block
-  for (int d=tx;d<NDIM/4;d+=M7W)
-    for (int j=ty;j<M7W;j+=M7H/M7R)     
-      buffer1[j*NDIM/4 + (d + j)%(NDIM/4)] = ((float4*)d_pts1)[(bp1 + j)*(NDIM/4) + d];  // column is rotated by the row number j
-
-  float max_score[NRX];
-  int index[NRX];
-  for (int i=0;i<NRX;i++) {
-    max_score[i] = 0.0f;  // a score must exceed 0 to be recorded
-    index[i] = -1;
-  }
-  int idx = ty*M7W + tx;   // linear thread number inside the block
-  int ix = idx%(M7W/NRX);  // this thread scores buffer1 rows ix + (M7W/NRX)*i
-  int iy = idx/(M7W/NRX);  // ... against buffer2 rows M7R*iy + dy
-  for (int bp2=0;bp2<NPTS;bp2+=M7H) {  // walk d_pts2 in tiles of M7H rows
-    float score[M7R][NRX];                                    
-    for (int dy=0;dy<M7R;dy++)
-      for (int i=0;i<NRX;i++)
-	score[dy][i] = 0.0f;
-    for (int dp=0;dp<NDIM/4;dp+=NUM) {  // one slice of NUM float4 columns per pass
-      int d = (idx%NUM);
-      int j = (idx/NUM);
-      buffer2[j*NUM + d] = ((float4*)d_pts2)[(bp2 + j)*(NDIM/4) + dp + d];  // one float4 per thread; assumes the block has exactly M7H*NUM threads - TODO confirm
-      __syncthreads();
-
-      if (idx<M7W*M7H/M7R/NRX) {  // only these threads accumulate
-	for (int d=0;d<NUM;d++) {  // note: this d shadows the d declared above
-	  float4 v1[NRX];
-	  for (int i=0;i<NRX;i++) 
-	    v1[i] = buffer1[((M7W/NRX)*i + ix)*NDIM/4 + (dp + d + (M7W/NRX)*i + ix)%(NDIM/4)];  // undo the column rotation applied at load time
-	  for (int dy=0;dy<M7R;dy++) {
-	    float4 v2 = buffer2[(M7R*iy + dy)*NUM + d];    
-	    for (int i=0;i<NRX;i++) {
-	      score[dy][i] += v1[i].x*v2.x;
-	      score[dy][i] += v1[i].y*v2.y;
-	      score[dy][i] += v1[i].z*v2.z;
-	      score[dy][i] += v1[i].w*v2.w;
-	    }
-	  }
-	}
-      }
-      __syncthreads();  // slice fully consumed before the next pass overwrites buffer2
-    }
-    for (int dy=0;dy<M7R;dy++) {
-      for (int i=0;i<NRX;i++) {
-	if (score[dy][i]>max_score[i]) {
-	  max_score[i] = score[dy][i];     
-	  index[i] = bp2 + M7R*iy + dy;  // row number in d_pts2
-	}
-      }
-    }
-    __syncthreads();
-  }
-
-  float *scores = (float*)buffer1;            // buffer1 is reused as scratch for the final reduction
-  int *indices = (int*)&scores[M7W*M7H/M7R];  // index table starts right after the score table
-  if (idx<M7W*M7H/M7R/NRX) {
-    for (int i=0;i<NRX;i++) {
-      scores[iy*M7W + (M7W/NRX)*i + ix] = max_score[i];  
-      indices[iy*M7W + (M7W/NRX)*i + ix] = index[i];
-    }
-  }
-  __syncthreads();
-  
-  if (ty==0) {  // one thread per d_pts1 row picks the best of the M7H/M7R partial results
-    float max_score = scores[tx];
-    int index = indices[tx];
-    for (int y=0;y<M7H/M7R;y++)
-      if (scores[y*M7W + tx]>max_score) {
-	max_score = scores[y*M7W + tx]; 
-	index = indices[y*M7W + tx];    
-      }
-    d_score[bp1 + tx] = max_score;
-    d_index[bp1 + tx] = index;
-  }
-}
-
-__global__ void Match9(float *d_pts1, float *d_pts2, float *d_score, int *d_index)  // for the M7W d_pts1 rows of this block: largest dot product against the d_pts2 rows -> d_score, winning row -> d_index
-{
-#define NRX 2
-#define NUM 8
-  __shared__ float4 buffer1[M7W*NDIM/4];  // M7W rows of d_pts1, NDIM/4 float4 each
-  __shared__ float4 buffer2[M7H*NUM];     // M7H rows of d_pts2, NUM float4 each (one slice)
-  int tx = threadIdx.x;
-  int ty = threadIdx.y;
-  int bp1 = M7W*blockIdx.x;  // first d_pts1 row handled by this block
-  for (int d=tx;d<NDIM/4;d+=M7W)
-    for (int j=ty;j<M7W;j+=M7H/M7R)     
-      buffer1[j*NDIM/4 + (d + j)%(NDIM/4)] = ((float4*)d_pts1)[(bp1 + j)*(NDIM/4) + d];  // column is rotated by the row number j
-
-  float max_score[NRX];
-  int index[NRX];
-  for (int i=0;i<NRX;i++) {
-    max_score[i] = 0.0f;  // a score must exceed 0 to be recorded
-    index[i] = -1;
-  }
-  int idx = ty*M7W + tx;   // linear thread number inside the block
-  int ix = idx%(M7W/NRX);  // this thread scores buffer1 rows ix + (M7W/NRX)*i
-  int iy = idx/(M7W/NRX);  // ... against buffer2 rows M7R*iy + dy
-  for (int bp2=0;bp2<NPTS;bp2+=M7H) {  // walk d_pts2 in tiles of M7H rows
-    
-      float score[M7R][NRX];                                    
-      if (idx<M7W*M7H/M7R/NRX) {    // 128
-	for (int dy=0;dy<M7R;dy++)
-	  for (int i=0;i<NRX;i++)
-	    score[dy][i] = 0.0f;
-      }
-
-      for (int d=0;d<NDIM/4;d+=NUM) {  // one slice of NUM float4 columns per pass
-	if (idx<M7H*NUM)            // 256
-	  buffer2[idx] = ((float4*)d_pts2)[(bp2 + (idx/NUM))*(NDIM/4) + d + (idx%NUM)];
-	__syncthreads();
-
-	if (idx<M7W*M7H/M7R/NRX) {  // 128
-	  for (int j=0;j<NUM;j++) {
-	    float4 v1[NRX];
-	    for (int i=0;i<NRX;i++) 
-	      v1[i] = buffer1[((M7W/NRX)*i + ix)*NDIM/4 + (d + j + (M7W/NRX)*i + ix)%(NDIM/4)];  // undo the column rotation applied at load time
-	    for (int dy=0;dy<M7R;dy++) {
-	      float4 v2 = buffer2[(M7R*iy + dy)*NUM + j];  // row must be the one recorded in index[] below (iy, not ty)
-	      for (int i=0;i<NRX;i++) {
-		score[dy][i] += v1[i].x*v2.x;
-		score[dy][i] += v1[i].y*v2.y;
-		score[dy][i] += v1[i].z*v2.z;
-		score[dy][i] += v1[i].w*v2.w;
-	      }	      
-	    }
-	  }
-	}
-	__syncthreads();  // slice fully consumed before the next pass overwrites buffer2
-      }
-      
-      if (idx<M7W*M7H/M7R/NRX) {  // 128
-	for (int dy=0;dy<M7R;dy++) {
-	  for (int i=0;i<NRX;i++) {
-	    if (score[dy][i]>max_score[i]) {
-	      max_score[i] = score[dy][i];     
-	      index[i] = bp2 + M7R*iy + dy;  // row number in d_pts2
-	    }
-	  }
-	}
-      }
-      __syncthreads();
-  }
-
-  float *scores = (float*)buffer1;            // buffer1 is reused as scratch for the final reduction
-  int *indices = (int*)&scores[M7W*M7H/M7R];  // index table starts right after the score table
-  if (idx<M7W*M7H/M7R/NRX) {
-    for (int i=0;i<NRX;i++) {
-      scores[iy*M7W + (M7W/NRX)*i + ix] = max_score[i];  
-      indices[iy*M7W + (M7W/NRX)*i + ix] = index[i];
-    }
-  }
-  __syncthreads();
-  
-  if (ty==0) {  // one thread per d_pts1 row picks the best of the M7H/M7R partial results
-    float max_score = scores[tx];
-    int index = indices[tx];
-    for (int y=0;y<M7H/M7R;y++)
-      if (scores[y*M7W + tx]>max_score) {
-	max_score = scores[y*M7W + tx]; 
-	index = indices[y*M7W + tx];    
-      }
-    d_score[bp1 + tx] = max_score;
-    d_index[bp1 + tx] = index;
-  }
-}
-
-
-int main(int argc, char *argv[])  // benchmark driver: fills two random point sets, runs the CPU reference and the GPU kernels, prints timings, optionally checks the result
-{
-  safeCall(cudaSetDevice(0));
-
-  size_t space = sizeof(float)*NPTS*NDIM*2 + 8;  // NOTE(review): adds 8 bytes here while data below has 8 spare floats
-  std::vector<float> data(NPTS*NDIM*2 + 8);      // both point sets in one buffer, with slack for alignment
-  void *ptr = (void*)&data[0];
-  float *h_pts1 = (float*)std::align(32, sizeof(float)*NPTS*NDIM, ptr, space);  // 32-byte aligned start of set 1; result is not checked for nullptr
-  ptr = (void*)&data[NPTS*NDIM];
-  float *h_pts2 = (float*)std::align(32, sizeof(float)*NPTS*NDIM, ptr, space);  // 32-byte aligned start of set 2
-  std::vector<int> h_index(NPTS);     // CPU result
-  std::vector<float> h_score(NPTS);   // CPU result
-  std::vector<int> h_index2(NPTS);    // GPU result
-  std::vector<float> h_score2(NPTS);  // GPU result
-  
-  float *d_pts1, *d_pts2, *d_score;
-  int *d_index;
-  std::cout << std::endl;
-  int psize = sizeof(float)*NPTS;  // bytes of one float per point
-  std::cout << "Data size:   " << 2.0*psize*NDIM/1024/1024 << " MB" << std::endl;
-  TimerGPU time;
-  float ltime = time.read();
-
-  safeCall(cudaMalloc((void **)&d_pts1, psize*NDIM));
-  safeCall(cudaMalloc((void **)&d_pts2, psize*NDIM));
-  safeCall(cudaMalloc((void **)&d_index, psize));
-  safeCall(cudaMalloc((void **)&d_score, psize));
-  std::cout << "Allocate:    " << time.read() - ltime << " ms" << std::endl;
-
-  for (int i=0;i<NPTS;i++) {
-    float sum1 = 0.0f, sum2 = 0.0f;
-    for (int d=0;d<NDIM;d++) {
-      sum1 += h_pts1[i*NDIM + d] = (float)rand()/RAND_MAX;
-      sum2 += h_pts2[i*NDIM + d] = (float)rand()/RAND_MAX;
-    }
-    sum1 = sqrt(NDIM)/sum1;  // scale factor that makes the components of the row sum to sqrt(NDIM)
-    sum2 = sqrt(NDIM)/sum2;
-    for (int d=0;d<NDIM;d++) {
-      h_pts1[i*NDIM + d] *= sum1;
-      h_pts2[i*NDIM + d] *= sum2;
-    }
-  }
-  ltime = time.read();
-  safeCall(cudaMemcpy(d_pts1, h_pts1, psize*NDIM, cudaMemcpyHostToDevice));
-  safeCall(cudaMemcpy(d_pts2, h_pts2, psize*NDIM, cudaMemcpyHostToDevice));
-  float delay = time.read() - ltime;
-  std::cout << "Upload:      " << delay << " ms  " << 2*psize*NDIM/delay/1024/1024 << " MB/ms" << std::endl;  // NOTE(review): 2*psize*NDIM looks like int arithmetic before the division; may overflow for large NPTS - confirm
-
-  if (RUNCPU) {
-#if 0
-    ltime = time.read();
-    MatchC1(h_pts1, h_pts2, h_score.data(), h_index.data());
-    delay = time.read() - ltime;
-    std::cout << "MatchCPU1:   " << delay << " ms  " << 2.0*NPTS*NPTS*NDIM/delay/1024/1024 << " Gflops" << std::endl;
-
-    ltime = time.read();
-    MatchC2(h_pts1, h_pts2, h_score.data(), h_index.data());
-    delay = time.read() - ltime;
-    std::cout << "MatchCPU2:   " << delay << " ms  " << 2.0*NPTS*NPTS*NDIM/delay/1024/1024 << " Gflops" << std::endl;
-#endif
-
-    ltime = time.read();
-    MatchC3(h_pts1, h_pts2, h_score.data(), h_index.data());  // the CPU result that CheckMatches compares against
-    delay = time.read() - ltime;
-    std::cout << "MatchCPU3:   " << delay << " ms  " << 2.0*NPTS*NPTS*NDIM/delay/1024/1024 << " Gflops" << std::endl;
-  }
-  dim3 blocks, threads;  // every kernel below writes to the same d_score / d_index
-#if 0
-  blocks = dim3(NPTS/M1W);
-  threads = dim3(M1W);
-  ltime = time.read();
-  Match1<<<blocks,threads>>>(d_pts1, d_pts2, d_score, d_index);
-  delay = time.read() - ltime;
-  checkMsg("Match1 error");
-  std::cout << "MatchGPU1:   " << delay << " ms  " << 2.0*NPTS*NPTS*NDIM/delay/1024/1024 << " Gflops" << std::endl;
-
-  blocks = dim3(NPTS/M2W);
-  threads = dim3(M2W, M2H);
-  ltime = time.read();
-  Match2<<<blocks,threads>>>(d_pts1, d_pts2, d_score, d_index);
-  delay = time.read() - ltime;
-  checkMsg("Match2 error");
-  std::cout << "MatchGPU2:   " << delay << " ms  " << 2.0*NPTS*NPTS*NDIM/delay/1024/1024 << " Gflops" << std::endl;
-#endif  
-
-  blocks = dim3(NPTS/M2W);  // integer division: assumes NPTS is a multiple of the tile width - TODO confirm
-  threads = dim3(M2W, M2H);
-  ltime = time.read();
-  Match3<<<blocks,threads>>>(d_pts1, d_pts2, d_score, d_index);
-  delay = time.read() - ltime;  // whether this covers kernel completion depends on TimerGPU::read(), not visible here
-  checkMsg("Match3 error");
-  std::cout << "MatchGPU3:   " << delay << " ms  " << 2.0*NPTS*NPTS*NDIM/delay/1024/1024 << " Gflops" << std::endl;
-  
-  blocks = dim3(NPTS/M2W);
-  threads = dim3(M2W, M2H);
-  ltime = time.read();
-  Match4<<<blocks,threads>>>(d_pts1, d_pts2, d_score, d_index);
-  delay = time.read() - ltime;
-  checkMsg("Match4 error");
-  std::cout << "MatchGPU4:   " << delay << " ms  " << 2.0*NPTS*NPTS*NDIM/delay/1024/1024 << " Gflops" << std::endl;
-  
-  blocks = dim3(NPTS/M5W);
-  threads = dim3(M5W, M5H);
-  ltime = time.read();
-  Match5<<<blocks,threads>>>(d_pts1, d_pts2, d_score, d_index);
-  delay = time.read() - ltime;
-  checkMsg("Match5 error");
-  std::cout << "MatchGPU5:   " << delay << " ms  " << 2.0*NPTS*NPTS*NDIM/delay/1024/1024 << " Gflops" << std::endl;
-  
-  blocks = dim3(NPTS/M5W);
-  threads = dim3(M5W, M5H);
-  ltime = time.read();
-  Match6<<<blocks,threads>>>(d_pts1, d_pts2, d_score, d_index);
-  delay = time.read() - ltime;
-  checkMsg("Match6 error");
-  std::cout << "MatchGPU6:   " << delay << " ms  " << 2.0*NPTS*NPTS*NDIM/delay/1024/1024 << " Gflops" << std::endl;
-
-  blocks = dim3(NPTS/M7W);
-  threads = dim3(M7W, M7H/M7R);
-  ltime = time.read();
-  Match7<<<blocks,threads>>>(d_pts1, d_pts2, d_score, d_index);
-  delay = time.read() - ltime;
-  checkMsg("Match7 error");
-  std::cout << "MatchGPU7:   " << delay << " ms  " << 2.0*NPTS*NPTS*NDIM/delay/1024/1024 << " Gflops" << std::endl;
-
-  blocks = dim3(NPTS/M7W);
-  threads = dim3(M7W, M7H/M7R);
-  ltime = time.read();
-  Match8<<<blocks,threads>>>(d_pts1, d_pts2, d_score, d_index);
-  delay = time.read() - ltime;
-  checkMsg("Match8 error");
-  std::cout << "MatchGPU8:   " << delay << " ms  " << 2.0*NPTS*NPTS*NDIM/delay/1024/1024 << " Gflops" << std::endl;
-  #if 1
-  blocks = dim3(NPTS/M7W);
-  threads = dim3(M7W, M7H/M7R/2);  // half as many thread rows as Match8
-  ltime = time.read();
-  Match8small<<<blocks,threads>>>(d_pts1, d_pts2, d_score, d_index);
-  delay = time.read() - ltime;
-  checkMsg("Match8small error");
-  std::cout << "Match8small:   " << delay << " ms  " << 2.0*NPTS*NPTS*NDIM/delay/1024/1024 << " Gflops" << std::endl;
-  #endif
-  #if 1
-  blocks = dim3(NPTS/M7W);
-  threads = dim3(M7W, M7H/M7R);
-  ltime = time.read();
-  Match8blocked<<<blocks,threads>>>(d_pts1, d_pts2, d_score, d_index);
-  delay = time.read() - ltime;
-  checkMsg("Match8blocked error");
-  std::cout << "MatchGPU8blocked:   " << delay << " ms  " << 2.0*NPTS*NPTS*NDIM/delay/1024/1024 << " Gflops" << std::endl;
-  #endif
-  ltime = time.read();
-  safeCall(cudaMemcpy(h_index2.data(), d_index, psize, cudaMemcpyDeviceToHost));  // holds the output of the last kernel launched above
-  safeCall(cudaMemcpy(h_score2.data(), d_score, psize, cudaMemcpyDeviceToHost));
-  delay = time.read() - ltime;
-  std::cout << "Download:    " << delay << " ms  " << 2*psize/delay/1024/1024 << " MB/ms" << std::endl;
-  ltime = time.read();  // value is not used afterwards
-
-  if (CHECK)
-    CheckMatches(h_index.data(), h_index2.data(), h_score.data(), h_score2.data());  // CPU arrays are only filled when RUNCPU is set
-
-  std::cout << std::endl;
-  safeCall(cudaFree(d_pts1));
-  safeCall(cudaFree(d_pts2));
-  safeCall(cudaFree(d_index));
-  safeCall(cudaFree(d_score));
-}
diff --git a/match.pdf b/match.pdf
deleted file mode 100644
index d3eb724..0000000
Binary files a/match.pdf and /dev/null differ
diff --git a/matching.cu b/matching.cu
deleted file mode 100644
index 3dca6a0..0000000
--- a/matching.cu
+++ /dev/null
@@ -1,1207 +0,0 @@
-#include "cudaSift.h"
-#include "cudautils.h"
-
-//================= Device matching functions =====================//
-
-__global__ void MatchSiftPoints(SiftPoint *sift1, SiftPoint *sift2, float *corrData, int numPts1, int numPts2)  // writes the dot product of sift1[blockIdx.x] with 16 sift2 descriptors per block into corrData; numPts1 is not referenced
-{
-  __shared__ float siftPoint[128];  // the sift1 descriptor of this block
-  __shared__ float sums[16*16];     // one partial sum per thread; indexing assumes a 16x16 block
-  const int tx = threadIdx.x;
-  const int ty = threadIdx.y;
-  const int p1 = blockIdx.x;
-  const int p2 = blockIdx.y*16 + ty;
-  const float *ptr1 = sift1[p1].data;
-  const float *ptr2 = sift2[p2].data;  // only dereferenced below when p2<numPts2
-  const int i = 16*ty + tx;
-  if (ty<8)
-    siftPoint[i] = ptr1[i];  // rows 0..7 of the block copy the 128 values
-  __syncthreads();
-  float sum = 0.0f;
-  if (p2<numPts2)
-    for (int j=0;j<8;j++)
-      sum += siftPoint[16*j+tx] * ptr2[16*j+tx];  // thread tx covers elements tx, tx+16, ..., tx+112
-  sums[i] = sum;  // stays 0 for p2 >= numPts2
-  __syncthreads();
-  if (tx<8)
-    sums[i] += sums[i+8];  // 16 -> 8 partial sums per row
-  __syncthreads();
-  if (tx<4)
-    sums[i] += sums[i+4];  // 8 -> 4 partial sums per row
-  __syncthreads();
-  if (ty==0) {
-    sum = sums[16*tx+0] + sums[16*tx+1] + sums[16*tx+2] + sums[16*tx+3];  // thread tx finishes row tx
-    corrData[p1*gridDim.y*16 + blockIdx.y*16 + tx] = sum;  // row p1, row width gridDim.y*16
-  }
-  __syncthreads();
-}
-
-__global__ void MatchSiftPoints2(SiftPoint *sift1, SiftPoint *sift2, float *corrData, int numPts1, int numPts2)  // each block computes the dot products between 16 sift1 and 16 sift2 descriptors and stores them in corrData
-{
-  __shared__ float siftPoints1[16*128];  // 16 descriptors from sift1
-  __shared__ float siftPoints2[16*128];  // 16 descriptors from sift2
-  const int tx = threadIdx.x;
-  const int ty = threadIdx.y;
-  const float *ptr1 = sift1[min(numPts1-1,blockIdx.x*16 + ty)].data;  // clamped so out-of-range rows re-read the last point
-  const float *ptr2 = sift2[min(numPts2-1,blockIdx.y*16 + ty)].data;
-  for (int i=0;i<8;i++) {
-    siftPoints1[128*ty+16*i+tx] = ptr1[16*i+tx];  // row ty of the block copies descriptor ty; indexing assumes a 16x16 block
-    siftPoints2[128*ty+16*i+tx] = ptr2[16*i+tx];
-  }
-  __syncthreads();
-  const int p1 = blockIdx.x*16 + ty;
-  const int p2 = blockIdx.y*16 + tx;  // note: tx selects the sift2 descriptor here
-  const float *pt1 = &siftPoints1[ty*128];
-  const float *pt2 = &siftPoints2[tx*128];
-  float sum = 0.0f;
-  for (int i=0;i<128;i++) {
-    int itx = (i + tx)&127; // avoid bank conflicts
-    sum += pt1[itx]*pt2[itx];
-  }
-  if (p1<numPts1)
-    corrData[p1*gridDim.y*16 + p2] = (p2<numPts2 ? sum : -1.0f);  // columns past numPts2 are filled with -1
-}
-
-__global__ void FindMaxCorr(float *corrData, SiftPoint *sift1, SiftPoint *sift2, int numPts1, int corrWidth, int siftSize)  // per sift1 point: finds the best and second-best value in its corrData row and stores score, ambiguity and match; siftSize is not referenced
-{
-  __shared__ float maxScore[16*16];  // best value seen by each thread
-  __shared__ float maxScor2[16*16];  // second-best value seen by each thread
-  __shared__ int maxIndex[16*16];    // column of the best value
-  const int tx = threadIdx.x;
-  const int ty = threadIdx.y;
-  const int idx = ty*16 + tx;  // indexing assumes a 16x16 block
-  int p1 = blockIdx.x*16 + threadIdx.y;
-  p1 = (p1>=numPts1 ? numPts1-1 : p1);  // rows past the end redo the last point
-  maxScore[idx] = -1.0f;
-  maxScor2[idx] = -1.0f;
-  maxIndex[idx] = -1;
-  __syncthreads();
-  float *corrs = &corrData[p1*corrWidth];
-  for (int i=tx;i<corrWidth;i+=16) {  // thread tx scans columns tx, tx+16, ...
-    float val = corrs[i];
-    if (val>maxScore[idx]) {
-      maxScor2[idx] = maxScore[idx];
-      maxScore[idx] = val;
-      maxIndex[idx] = i;
-    } else if (val>maxScor2[idx])
-      maxScor2[idx] = val;
-  }
-  __syncthreads();
-  for (int len=8;len>0;len/=2) {  // tree reduction over the 16 candidates of each row
-    if (tx<len) {  // only slots below len fold in slot idx+len, so no slot is read and written in the same step
-      float val = maxScore[idx+len];
-      int i = maxIndex[idx+len];
-      if (val>maxScore[idx]) {
-	maxScor2[idx] = maxScore[idx];
-	maxScore[idx] = val;
-	maxIndex[idx] = i;
-      } else if (val>maxScor2[idx])
-	maxScor2[idx] = val;
-      float va2 = maxScor2[idx+len];
-      if (va2>maxScor2[idx])
-	maxScor2[idx] = va2;
-    }
-    __syncthreads();
-  }
-  if (tx==0) {
-    sift1[p1].score = maxScore[ty*16];
-    sift1[p1].ambiguity = maxScor2[ty*16] / (maxScore[ty*16] + 1e-6);  // second-best / best
-    sift1[p1].match = maxIndex[ty*16];
-    sift1[p1].match_xpos = sift2[maxIndex[ty*16]].xpos;  // NOTE(review): index is -1 if no value in the row exceeded -1
-    sift1[p1].match_ypos = sift2[maxIndex[ty*16]].ypos;
-  }
-}
-
-// Version based on suggestion by Nicholas Lin
-__global__ void FindMaxCorr3(float *corrData, SiftPoint *sift1, SiftPoint *sift2, int numPts1, int numPts2)  // per sift1 point: brute-force best and second-best dot product over sift2; corrData is used as scratch (32 floats per point)
-{
-  int block_dim = blockDim.x; // blockDim.x == 16
-  const int tx = threadIdx.x;
-  const int ty = threadIdx.y;
-  const int p1 = blockIdx.x * block_dim + ty;
-  const int idx = ty * 16 + tx;
-  
-  __shared__ int maxIndex[16 * 16];  // sift2 index of the best candidate of each thread
-  maxIndex[idx] = 0;
-  __syncthreads();
-  
-  float *corrs = NULL;
-  if (p1 < numPts1) {
-    corrs = &corrData[p1 * block_dim * 2];  // [0..15] best per thread, [16..31] second-best per thread
-    corrs[tx] = 0.0f;
-    corrs[tx + 16] = 0.0f;
-    const float *pt1 = sift1[p1].data;
-    for (int p2 = tx; p2 < numPts2; p2 += 16) {  // thread tx scans sift2 points tx, tx+16, ...
-      float *pt2 = sift2[p2].data;
-      float sum = 0.0f;
-      for (int i = 0; i < 128; i++) 
-	sum += pt1[i] * pt2[i];
-      if (sum > corrs[tx]) {
-	corrs[tx + 16] = corrs[tx];
-	corrs[tx] = sum;
-	maxIndex[idx] = p2;
-      }
-      else if (sum > corrs[tx + 16])
-	corrs[tx + 16] = sum;
-    }
-  }
-  __syncthreads();
-  // Tree reduction over the 16 candidates of each row. Rows with p1 >= numPts1 skip the work
-  // but still reach the barrier: __syncthreads() must be executed by every thread of the block.
-  for (int len = 8; len > 0; len /= 2) {
-    if (p1 < numPts1 && tx < len) {
-      float val = corrs[tx + len];
-      int i = maxIndex[idx + len];
-      if (val > corrs[tx]) {
-        corrs[tx + 16] = corrs[tx];
-        corrs[tx] = val;
-        maxIndex[idx] = i;
-      }
-      else if (val > corrs[tx + 16])
-        corrs[tx + 16] = val;
-      float va2 = corrs[tx + 16 + len];
-      if (va2 > corrs[tx + 16])
-        corrs[tx + 16] = va2;
-    }
-    __syncthreads();
-  }
-  if (p1 < numPts1 && tx==0) {
-    sift1[p1].score = corrs[0];
-    sift1[p1].ambiguity = corrs[16] / (corrs[0] + 1e-6);  // second-best / best
-    sift1[p1].match = maxIndex[ty << 4];
-    sift1[p1].match_xpos = sift2[maxIndex[ty << 4]].xpos;
-    sift1[p1].match_ypos = sift2[maxIndex[ty << 4]].ypos;
-  }
-}
-
-#define FMC2W 16
-#define FMC2H 4
-
-__global__ void FindMaxCorr2(SiftPoint *sift1, SiftPoint *sift2, int numPts1, int numPts2)  // one block per sift1 point: best and second-best dot product over all sift2 points -> score, ambiguity, match
-{
-  __shared__ float siftPoint[128];   // descriptor of sift1[p1]
-  __shared__ float maxScore[FMC2H];  // best score per thread row
-  __shared__ float maxScor2[FMC2H];  // second-best score per thread row
-  __shared__ int maxIndex[FMC2H];    // sift2 index of the best score per thread row
-  const int p1 = blockIdx.x;
-  if (p1>=numPts1)
-    return;  // same decision for every thread of the block, taken before any barrier
-  const int tx = threadIdx.x;
-  const int ty = threadIdx.y;
-  const int idx = ty*FMC2W + tx;
-  if (idx<FMC2H) {
-    maxScore[idx] = -1.0f;
-    maxScor2[idx] = -1.0f;
-    maxIndex[idx] = 0;
-  }
-  __syncthreads();
-  const float *pt1 = sift1[p1].data;
-  for (int i=idx;i<128;i+=FMC2W*FMC2H)
-    siftPoint[i] = pt1[i];
-  __syncthreads();
-  for (int p2=ty;p2<numPts2;p2+=FMC2H) {  // thread row ty handles sift2 points ty, ty+FMC2H, ...
-    const float *pt2 = sift2[p2].data;
-    float sum = 0.0f;
-    for (int j=tx;j<128;j+=FMC2W)
-      sum += siftPoint[j] * pt2[j];
-    for (int j=FMC2W/2;j>0;j/=2)
-      sum += ShiftDown(sum, j);  // ShiftDown is defined elsewhere; presumably a warp shuffle-down so tx==0 ends with the row total - TODO confirm
-    if (tx==0) {
-      if (sum>maxScore[ty]) {
-	maxScor2[ty] = maxScore[ty];
-	maxScore[ty] = sum;
-	maxIndex[ty] = p2;
-      } else if (sum>maxScor2[ty])
-	maxScor2[ty] = sum;
-    }
-  }
-  __syncthreads();
-  for (int len=FMC2H/2;len>0;len/=2) {  // merge the FMC2H per-row results
-    if (ty==0 && tx<len) {
-      float val = maxScore[tx+len];
-      int p2 = maxIndex[tx+len];
-      if (val>maxScore[tx]) {
-	maxScor2[tx] = maxScore[tx];
-	maxScore[tx] = val;
-	maxIndex[tx] = p2;
-      } else if (val>maxScor2[tx])
-	maxScor2[tx] = val;
-      float va2 = maxScor2[tx+len];
-      if (va2>maxScor2[tx])
-	maxScor2[tx] = va2;
-    }
-    __syncthreads();
-  }
-  if (ty==0 && tx==0) {
-    sift1[p1].score = maxScore[0];
-    sift1[p1].ambiguity = maxScor2[0] / (maxScore[0] + 1e-6);  // second-best / best
-    sift1[p1].match = maxIndex[0];
-    sift1[p1].match_xpos = sift2[maxIndex[0]].xpos;
-    sift1[p1].match_ypos = sift2[maxIndex[0]].ypos;
-  }
-}
-
-__global__ void FindMaxCorr4(SiftPoint *sift1, SiftPoint *sift2, int numPts1, int numPts2)  // each thread row ty matches one sift1 point against every sift2 point; numPts1 is not referenced
-{
-  __shared__ float siftPoint[128*FMC2H];  // one sift1 descriptor per thread row
-  __shared__ float maxScore[FMC2H];       // best score per thread row
-  __shared__ float maxScor2[FMC2H];       // second-best score per thread row
-  __shared__ int maxIndex[FMC2H];         // sift2 index of the best score per thread row
-  const int tx = threadIdx.x;
-  const int ty = threadIdx.y;
-  if (tx==0) {
-    maxScore[ty] = -1.0f;
-    maxScor2[ty] = -1.0f;
-    maxIndex[ty] = 0;
-  }
-  const int p1 = blockIdx.x*FMC2H + ty;  // NOTE(review): not clamped to numPts1; presumably sift1 is padded to a multiple of FMC2H - confirm against the caller
-  const float *pt1 = sift1[p1].data;
-  for (int j=tx;j<128;j+=FMC2W)
-    siftPoint[128*ty + j] = pt1[j];
-  __syncthreads();
-  for (int p2=0;p2<numPts2;p2++) {
-    const float *pt2 = sift2[p2].data;
-    float sum = 0.0f;
-    for (int j=tx;j<128;j+=FMC2W)
-      sum += siftPoint[128*ty + j] * pt2[j]; 
-    for (int j=FMC2W/2;j>0;j/=2)
-      sum += ShiftDown(sum, j);  // ShiftDown is defined elsewhere; presumably a warp shuffle-down so tx==0 ends with the row total - TODO confirm
-    if (tx==0) {  // only thread 0 of a row touches that row's shared slots
-      if (sum>maxScore[ty]) {
-	maxScor2[ty] = maxScore[ty];
-	maxScore[ty] = sum;
-	maxIndex[ty] = p2;
-      } else if (sum>maxScor2[ty])
-	maxScor2[ty] = sum;
-    }
-  }
-  __syncthreads();
-  if (tx==0) {
-    sift1[p1].score = maxScore[ty];
-    sift1[p1].ambiguity = maxScor2[ty] / (maxScore[ty] + 1e-6);  // second-best / best
-    sift1[p1].match = maxIndex[ty];
-    sift1[p1].match_xpos = sift2[maxIndex[ty]].xpos;
-    sift1[p1].match_ypos = sift2[maxIndex[ty]].ypos;
-  }
-}
-
-
-__global__ void CleanMatches(SiftPoint *sift1, int numPts1)  // sets the score of every sift1 point to 0
-{
-  const int p1 = min(blockIdx.x*64 + threadIdx.x, numPts1-1);  // 64 points per block; surplus threads rewrite the last point
-  sift1[p1].score = 0.0f;
-}
-
-#define M7W   32
-#define M7H   32
-#define M7R    4
-#define NRX    2
-#define NDIM 128
-
-__global__ void FindMaxCorr10(SiftPoint *sift1, SiftPoint *sift2, int numPts1, int numPts2)  // per block: matches M7W sift1 points against sift2 in tiles of M7H points and stores score, match, match position and ambiguity
-{
-  __shared__ float4 buffer1[M7W*NDIM/4];  // M7W sift1 descriptors, NDIM/4 float4 each
-  __shared__ float4 buffer2[M7H*NDIM/4];  // one tile of M7H sift2 descriptors
-  int tx = threadIdx.x;
-  int ty = threadIdx.y;
-  int bp1 = M7W*blockIdx.x;  // first sift1 point handled by this block
-  for (int j=ty;j<M7W;j+=M7H/M7R) {    
-    int p1 = min(bp1 + j, numPts1 - 1);  // clamp: rows past the end reload the last point
-    for (int d=tx;d<NDIM/4;d+=M7W)
-      buffer1[j*NDIM/4 + (d + j)%(NDIM/4)] = ((float4*)&sift1[p1].data)[d];  // column is rotated by the row number j
-  }
-      
-  float max_score[NRX];
-  float sec_score[NRX];
-  int index[NRX];
-  for (int i=0;i<NRX;i++) {
-    max_score[i] = 0.0f;  // a score must exceed 0 to be recorded
-    sec_score[i] = 0.0f;
-    index[i] = -1;
-  }
-  int idx = ty*M7W + tx;   // linear thread number inside the block
-  int ix = idx%(M7W/NRX);  // this thread scores buffer1 rows ix + (M7W/NRX)*i
-  int iy = idx/(M7W/NRX);  // ... against buffer2 rows M7R*iy + dy
-  for (int bp2=0;bp2<numPts2 - M7H + 1;bp2+=M7H) {  // NOTE(review): only full tiles are visited, so the last numPts2 % M7H points look skipped - confirm
-    for (int j=ty;j<M7H;j+=M7H/M7R) {      
-      int p2 = min(bp2 + j, numPts2 - 1);
-      for (int d=tx;d<NDIM/4;d+=M7W)
-	buffer2[j*NDIM/4 + d] = ((float4*)&sift2[p2].data)[d];
-    }
-    __syncthreads();
-
-    if (idx<M7W*M7H/M7R/NRX) {  // only these threads accumulate; every thread takes part in the loads
-      float score[M7R][NRX];                                    
-      for (int dy=0;dy<M7R;dy++)
-	for (int i=0;i<NRX;i++)
-	  score[dy][i] = 0.0f;
-      for (int d=0;d<NDIM/4;d++) {
-	float4 v1[NRX];
-	for (int i=0;i<NRX;i++) 
-	  v1[i] = buffer1[((M7W/NRX)*i + ix)*NDIM/4 + (d + (M7W/NRX)*i + ix)%(NDIM/4)];  // undo the column rotation applied at load time
-	for (int dy=0;dy<M7R;dy++) {
-	  float4 v2 = buffer2[(M7R*iy + dy)*(NDIM/4) + d];    
-	  for (int i=0;i<NRX;i++) {
-	    score[dy][i] += v1[i].x*v2.x;
-	    score[dy][i] += v1[i].y*v2.y;
-	    score[dy][i] += v1[i].z*v2.z;
-	    score[dy][i] += v1[i].w*v2.w;
-	  }
-	}
-      }
-      for (int dy=0;dy<M7R;dy++) {
-	for (int i=0;i<NRX;i++) {
-	  if (score[dy][i]>max_score[i]) {
-	    sec_score[i] = max_score[i];
-	    max_score[i] = score[dy][i];     
-	    index[i] = min(bp2 + M7R*iy + dy, numPts2-1);
-	  } else if (score[dy][i]>sec_score[i])
-	    sec_score[i] = score[dy][i]; 
-	}
-      }
-    }
-    __syncthreads();  // tile fully consumed before the next iteration overwrites buffer2
-  }
-
-  float *scores1 = (float*)buffer1;            // buffer1 is reused as scratch: best scores
-  float *scores2 = &scores1[M7W*M7H/M7R];      // second-best scores
-  int *indices = (int*)&scores2[M7W*M7H/M7R];  // indices of the best scores
-  if (idx<M7W*M7H/M7R/NRX) {
-    for (int i=0;i<NRX;i++) {
-      scores1[iy*M7W + (M7W/NRX)*i + ix] = max_score[i];  
-      scores2[iy*M7W + (M7W/NRX)*i + ix] = sec_score[i];  
-      indices[iy*M7W + (M7W/NRX)*i + ix] = index[i];
-    }
-  }
-  __syncthreads();
-  
-  if (ty==0) {  // one thread per sift1 point merges the M7H/M7R partial results
-    float max_score = scores1[tx];
-    float sec_score = scores2[tx];
-    int index = indices[tx];
-    for (int y=0;y<M7H/M7R;y++)  // NOTE(review): scores2 of rows y>0 is never read in this merge
-      if (index != indices[y*M7W + tx]) {
-	if (scores1[y*M7W + tx]>max_score) {
-	  sec_score = max(max_score, sec_score);
-	  max_score = scores1[y*M7W + tx]; 
-	  index = indices[y*M7W + tx];
-	} else if (scores1[y*M7W + tx]>sec_score)
-	  sec_score = scores1[y*M7W + tx];
-      }
-    sift1[bp1 + tx].score = max_score;  // NOTE(review): bp1 + tx is not clamped to numPts1 here, unlike the load above - confirm sift1 is padded
-    sift1[bp1 + tx].match = index;
-    sift1[bp1 + tx].match_xpos = sift2[index].xpos;  // NOTE(review): index is still -1 when no score exceeded 0
-    sift1[bp1 + tx].match_ypos = sift2[index].ypos;
-    sift1[bp1 + tx].ambiguity = sec_score / (max_score + 1e-6f);
-  }
-}
-  
-#define FMC_GH  512
-#define FMC_BW   32
-#define FMC_BH   32
-#define FMC_BD   16
-#define FMC_TW    1
-#define FMC_TH    4
-#define FMC_NW   (FMC_BW/FMC_TW)   //  32
-#define FMC_NH   (FMC_BH/FMC_TH)   //   8
-#define FMC_NT   (FMC_NW*FMC_NH)   // 256 = 8 warps
-
-__device__ volatile int lock = 0;
-
-__global__ void FindMaxCorr9(SiftPoint *sift1, SiftPoint *sift2, int numPts1, int numPts2)  // per block: FMC_BW sift1 points against up to FMC_GH sift2 points, then merges the result into sift1 under the global lock
-{
-  __shared__ float4 siftParts1[FMC_BW*FMC_BD]; // FMC_BW*FMC_BD = 512 float4, used as two halves of FMC_BD/2 rows
-  __shared__ float4 siftParts2[FMC_BH*FMC_BD]; // FMC_BH*FMC_BD = 512 float4, used as two halves of FMC_BD/2 rows
-  //__shared__ float blksums[FMC_BW*FMC_BH];     // 32*32  = 1024
-  const int tx = threadIdx.x;
-  const int ty = threadIdx.y;
-  const int idx = ty*FMC_NW + tx;
-  float4 *pts1 = 0, *pts2 = 0;
-  if (idx<FMC_BW) {
-    const int p1l = min(blockIdx.x*FMC_BW + idx, numPts1-1);
-    pts1 = (float4*)sift1[p1l].data;  // only the first FMC_BW threads load sift1 data
-  }
-  float maxScore = -1.0f;
-  float maxScor2 = -1.0f;
-  int maxIndex = 0;
-  for (int k=0;k<min(FMC_GH, numPts2 - FMC_BH + 1);k+=FMC_BH) {  // NOTE(review): the bound does not include the blockIdx.y*FMC_GH offset used below - confirm intent
-    if (idx<FMC_BH) {
-      const int p2l = min(blockIdx.y*FMC_GH + k + idx, numPts2-1);
-      pts2 = (float4*)sift2[p2l].data;
-    }
-    float sums[FMC_TW*FMC_TH];  // per-thread tile of dot products
-    for (int i=0;i<FMC_TW*FMC_TH;i++) 
-      sums[i] = 0.0f;
-
-    if (idx<FMC_BW)
-      for (int i=0;i<FMC_BD/2;i++) 
-	siftParts1[(i + 0)*FMC_BW + idx] = pts1[0 + i];  // first chunk goes into half 0
-    if (idx<FMC_BH)
-      for (int i=0;i<FMC_BD/2;i++) 
-	siftParts2[(i + 0)*FMC_BH + idx] = pts2[0 + i];
-    __syncthreads();
-    
-    int b = FMC_BD/2;  // row offset of the half that is loaded next
-    for (int d=FMC_BD/2;d<32;d+=FMC_BD/2) {
-      if (idx<FMC_BW)
-	for (int i=0;i<FMC_BD/2;i++) 
-	  siftParts1[(i + b)*FMC_BW + idx] = pts1[d + i];  // load chunk d into half b
-      if (idx<FMC_BH)
-	for (int i=0;i<FMC_BD/2;i++) 
-	  siftParts2[(i + b)*FMC_BH + idx] = pts2[d + i];
-
-      b ^= FMC_BD/2;  // switch to the other half, which holds the previously loaded chunk
-      for (int i=0;i<FMC_BD/2;i++) {
-	float4 v1[FMC_TW];
-	for (int ix=0;ix<FMC_TW;ix++)
-	  v1[ix] = siftParts1[(i + b)*FMC_BW + (tx*FMC_TW + ix)];
-	for (int iy=0;iy<FMC_TH;iy++) {
-	  float4 v2 = siftParts2[(i + b)*FMC_BH + (ty*FMC_TH + iy)];
-	  for (int ix=0;ix<FMC_TW;ix++) {
-	    sums[iy*FMC_TW + ix] += v1[ix].x * v2.x;
-	    sums[iy*FMC_TW + ix] += v1[ix].y * v2.y;
-	    sums[iy*FMC_TW + ix] += v1[ix].z * v2.z;
-	    sums[iy*FMC_TW + ix] += v1[ix].w * v2.w;
-	  }
-	}
-      }
-      __syncthreads();
-    }
-    
-    b ^= FMC_BD/2;  // select the half that received the last chunk
-    for (int i=0;i<FMC_BD/2;i++) {
-      float4 v1[FMC_TW];
-      for (int ix=0;ix<FMC_TW;ix++)
-	v1[ix] = siftParts1[(i + b)*FMC_BW + (tx*FMC_TW + ix)];
-      for (int iy=0;iy<FMC_TH;iy++) {
-	float4 v2 = siftParts2[(i + b)*FMC_BH + (ty*FMC_TH + iy)];
-	for (int ix=0;ix<FMC_TW;ix++) {
-	  sums[iy*FMC_TW + ix] += v1[ix].x * v2.x;
-	  sums[iy*FMC_TW + ix] += v1[ix].y * v2.y;
-	  sums[iy*FMC_TW + ix] += v1[ix].z * v2.z;
-	  sums[iy*FMC_TW + ix] += v1[ix].w * v2.w;
-	}
-      }
-    }
-    __syncthreads();
-    
-    float *blksums = (float*)siftParts1;  // siftParts1 is reused as the FMC_BH x FMC_BW score table
-    for (int iy=0;iy<FMC_TH;iy++) 
-      for (int ix=0;ix<FMC_TW;ix++) 
-	blksums[(ty*FMC_TH + iy)*FMC_BW + (tx*FMC_TW + ix)] = sums[iy*FMC_TW + ix];
-    __syncthreads();
-    if (idx<FMC_BW) { 
-      for (int j=0;j<FMC_BH;j++) {  // thread idx scans column idx of the score table
-	float sum = blksums[j*FMC_BW + idx];
-	if (sum>maxScore) { 
-	  maxScor2 = maxScore;
-	  maxScore = sum;
-	  maxIndex = min(blockIdx.y*FMC_GH + k + j, numPts2-1);
-	} else if (sum>maxScor2)
-	  maxScor2 = sum;
-      }
-    }
-    __syncthreads();
-  }
-  const int p1 = min(blockIdx.x*FMC_BW + idx, numPts1-1);
-  if (idx==0)
-    while (atomicCAS((int *)&lock, 0, 1) != 0);  // thread 0 spins until it takes the single global lock
-  __syncthreads();
-  if (idx<FMC_BW) {
-    float maxScor2Old = sift1[p1].ambiguity*(sift1[p1].score + 1e-6f);  // rebuild the stored second-best score from ambiguity and score
-    if (maxScore>sift1[p1].score) {
-      maxScor2 = max(sift1[p1].score, maxScor2);  // the old best becomes a second-best candidate
-      sift1[p1].ambiguity = maxScor2 / (maxScore + 1e-6f);
-      sift1[p1].score = maxScore;
-      sift1[p1].match = maxIndex;
-      sift1[p1].match_xpos = sift2[maxIndex].xpos;
-      sift1[p1].match_ypos = sift2[maxIndex].ypos;
-    } else if (maxScore>maxScor2Old)
-      sift1[p1].ambiguity = maxScore / (sift1[p1].score + 1e-6f);
-  }
-  __syncthreads();
-  if (idx==0)
-    atomicExch((int* )&lock, 0);  // release the lock
-}
-
-__global__ void FindMaxCorr8(SiftPoint *sift1, SiftPoint *sift2, int numPts1, int numPts2)  // per block: FMC_BW sift1 points against up to FMC_GH sift2 points, then merges the result into sift1 under the global lock
-{
-  __shared__ float4 siftParts1[FMC_BW*FMC_BD]; // FMC_BW*FMC_BD = 512 float4: FMC_BD float4 of each sift1 descriptor
-  __shared__ float4 siftParts2[FMC_BH*FMC_BD]; // FMC_BH*FMC_BD = 512 float4: FMC_BD float4 of each sift2 descriptor
-  __shared__ float blksums[FMC_BW*FMC_BH];     // 32*32  = 1024
-  const int tx = threadIdx.x;
-  const int ty = threadIdx.y;
-  const int idx = ty*FMC_NW + tx;
-  float4 *pts1 = 0, *pts2 = 0;
-  if (idx<FMC_BW) {
-    const int p1l = min(blockIdx.x*FMC_BW + idx, numPts1-1);
-    pts1 = (float4*)sift1[p1l].data;  // only the first FMC_BW threads load sift1 data
-  }
-  float maxScore = -1.0f;
-  float maxScor2 = -1.0f;
-  int maxIndex = 0;
-  for (int k=0;k<min(FMC_GH, numPts2 - FMC_BH + 1);k+=FMC_BH) {  // NOTE(review): the bound does not include the blockIdx.y*FMC_GH offset used below - confirm intent
-    if (idx<FMC_BH) {
-      const int p2l = min(blockIdx.y*FMC_GH + k + idx, numPts2-1);
-      pts2 = (float4*)sift2[p2l].data;
-    }
-    float sums[FMC_TW*FMC_TH];  // per-thread tile of dot products
-    for (int i=0;i<FMC_TW*FMC_TH;i++) 
-      sums[i] = 0.0f;
-    for (int d=0;d<32;d+=FMC_BD) {  // descriptor processed in chunks of FMC_BD float4
-      if (idx<FMC_BW)
-	for (int i=0;i<FMC_BD;i++) 
-	  siftParts1[i*FMC_BW + idx] = pts1[d + i];
-      if (idx<FMC_BH)
-	for (int i=0;i<FMC_BD;i++) 
-	  siftParts2[i*FMC_BH + idx] = pts2[d + i];
-      __syncthreads();
-      
-      for (int i=0;i<FMC_BD;i++) {
-	float4 v1[FMC_TW];
-	for (int ix=0;ix<FMC_TW;ix++)
-	  v1[ix] = siftParts1[i*FMC_BW + (tx*FMC_TW + ix)];
-	for (int iy=0;iy<FMC_TH;iy++) {
-	  float4 v2 = siftParts2[i*FMC_BH + (ty*FMC_TH + iy)];
-	  for (int ix=0;ix<FMC_TW;ix++) {
-	    sums[iy*FMC_TW + ix] += v1[ix].x * v2.x;
-	    sums[iy*FMC_TW + ix] += v1[ix].y * v2.y;
-	    sums[iy*FMC_TW + ix] += v1[ix].z * v2.z;
-	    sums[iy*FMC_TW + ix] += v1[ix].w * v2.w;
-	  }
-	}
-      }
-      __syncthreads();  // chunk fully consumed before the next one overwrites it
-    }
-    //float *blksums = (float*)siftParts1;
-    for (int iy=0;iy<FMC_TH;iy++) 
-      for (int ix=0;ix<FMC_TW;ix++) 
-	blksums[(ty*FMC_TH + iy)*FMC_BW + (tx*FMC_TW + ix)] = sums[iy*FMC_TW + ix];
-    __syncthreads();
-    if (idx<FMC_BW) { 
-      for (int j=0;j<FMC_BH;j++) {  // thread idx scans column idx of the score table
-	float sum = blksums[j*FMC_BW + idx];
-	if (sum>maxScore) { 
-	  maxScor2 = maxScore;
-	  maxScore = sum;
-	  maxIndex = min(blockIdx.y*FMC_GH + k + j, numPts2-1);
-	} else if (sum>maxScor2)
-	  maxScor2 = sum;
-      }
-    }
-    __syncthreads();
-  }
-  const int p1 = min(blockIdx.x*FMC_BW + idx, numPts1-1);
-  if (idx==0)
-    while (atomicCAS((int *)&lock, 0, 1) != 0);  // thread 0 spins until it takes the single global lock
-  __syncthreads();
-  if (idx<FMC_BW) {
-    float maxScor2Old = sift1[p1].ambiguity*(sift1[p1].score + 1e-6f);  // rebuild the stored second-best score from ambiguity and score
-    if (maxScore>sift1[p1].score) {
-      maxScor2 = max(sift1[p1].score, maxScor2);  // the old best becomes a second-best candidate
-      sift1[p1].ambiguity = maxScor2 / (maxScore + 1e-6f);
-      sift1[p1].score = maxScore;
-      sift1[p1].match = maxIndex;
-      sift1[p1].match_xpos = sift2[maxIndex].xpos;
-      sift1[p1].match_ypos = sift2[maxIndex].ypos;
-    } else if (maxScore>maxScor2Old)
-      sift1[p1].ambiguity = maxScore / (sift1[p1].score + 1e-6f);
-  }
-  __syncthreads();
-  if (idx==0)
-    atomicExch((int* )&lock, 0);  // release the lock
-}
-
-__global__ void FindMaxCorr7(SiftPoint *sift1, SiftPoint *sift2, int numPts1, int numPts2)
-{
-  __shared__ float siftParts1[17*64]; // features in columns
-  __shared__ float siftParts2[16*64]; // one extra to avoid shared conflicts
-  float4 *pts1 = (float4*)siftParts1;
-  float4 *pts2 = (float4*)siftParts2;
-  const int tx = threadIdx.x;
-  const int ty = threadIdx.y;
-  const int p1l = min(blockIdx.x*16 + ty, numPts1-1);
-  const float4 *p1l4 = (float4*)sift1[p1l].data;
-  float maxScore = -1.0f;
-  float maxScor2 = -1.0f;
-  int maxIndex = 0;
-  for (int k=0;k<512/16;k++) {
-    const int p2l = min(blockIdx.y*512 + k*16 + ty, numPts2-1);
-    const float4 *p2l4 = (float4*)sift2[p2l].data;
-#define NUM 4
-    float sum[NUM];
-    if (ty<(16/NUM))
-      for (int l=0;l<NUM;l++)
-	sum[l] = 0.0f;
-    __syncthreads();
-    for (int i=0;i<2;i++) {
-      pts1[17*tx + ty] = p1l4[i*16 + tx];
-      pts2[16*ty + tx] = p2l4[i*16 + tx];
-      __syncthreads(); 
-      if (ty<(16/NUM)) {
-#pragma unroll
-	for (int j=0;j<16;j++) {
-	  float4 p1v = pts1[17* j + tx];
-#pragma unroll
-	  for (int l=0;l<NUM;l++) {
-	    float4 p2v = pts2[16*(ty + l*(16/NUM)) +  j];
-	    sum[l] += p1v.x * p2v.x;
-	    sum[l] += p1v.y * p2v.y;
-	    sum[l] += p1v.z * p2v.z;
-	    sum[l] += p1v.w * p2v.w;
-	  }
-	}
-      }
-      __syncthreads();
-    }
-    float *sums = siftParts1;
-    if (ty<(16/NUM))
-      for (int l=0;l<NUM;l++) 
-	sums[16*(ty + l*(16/NUM)) + tx] = sum[l];
-    __syncthreads();
-    if (ty==0) { 
-      for (int j=0;j<16;j++) {
-	float sum = sums[16*j + tx];
-	if (sum>maxScore) { 
-	  maxScor2 = maxScore;
-	  maxScore = sum;
-	  maxIndex = min(blockIdx.y*512 +  k*16 + j, numPts2-1);
-	} else if (sum>maxScor2)
-	  maxScor2 = sum;
-      }
-    }
-    __syncthreads();
-  }
-  const int p1 = min(blockIdx.x*16 + tx, numPts1-1);
-  if (tx==0 && ty==0)
-    while (atomicCAS((int *)&lock, 0, 1) != 0);
-  __syncthreads();
-  if (ty==0) {
-    float maxScor2Old = sift1[p1].ambiguity*(sift1[p1].score + 1e-6f);
-    if (maxScore>sift1[p1].score) {
-      maxScor2 = max(sift1[p1].score, maxScor2);
-      sift1[p1].ambiguity = maxScor2 / (maxScore + 1e-6f);
-      sift1[p1].score = maxScore;
-      sift1[p1].match = maxIndex;
-      sift1[p1].match_xpos = sift2[maxIndex].xpos;
-      sift1[p1].match_ypos = sift2[maxIndex].ypos;
-    } else if (maxScore>maxScor2Old)
-      sift1[p1].ambiguity = maxScore / (sift1[p1].score + 1e-6f);
-  }
-  __syncthreads();
-  if (tx==0 && ty==0)
-    atomicExch((int* )&lock, 0);
-}
-
-__global__ void FindMaxCorr6(SiftPoint *sift1, SiftPoint *sift2, int numPts1, int numPts2)
-{
-  //__shared__ float siftParts1[128*16]; // features in columns
-  __shared__ float siftParts2[128*16]; // one extra to avoid shared conflicts
-  __shared__ float sums[16*16];
-  const int tx = threadIdx.x;
-  const int ty = threadIdx.y;
-  const int p1l = min(blockIdx.x*16 + ty, numPts1-1);
-  float *pt1l = sift1[p1l].data;
-  float4 part1 = reinterpret_cast<float4*>(pt1l)[tx];
-  float maxScore = -1.0f;
-  float maxScor2 = -1.0f;
-  int maxIndex = 0;
-  for (int k=0;k<512;k+=16) {
-    const int p2l = min(blockIdx.y*512 + k + ty, numPts2-1);
-    float *pt2l = sift2[p2l].data;
-    reinterpret_cast<float4*>(siftParts2)[32*ty + tx] = reinterpret_cast<float4*>(pt2l)[tx];
-    __syncthreads();
-    for (int i=0;i<16;i++) {
-      float4 part2 = reinterpret_cast<float4*>(siftParts2)[32*i  + tx];
-      float sum = part1.x*part2.x + part1.y*part2.y + part1.z*part2.z + part1.w*part2.w;
-      sum += ShiftDown(sum, 16);
-      sum += ShiftDown(sum, 8);
-      sum += ShiftDown(sum, 4);
-      sum += ShiftDown(sum, 2);
-      sum += ShiftDown(sum, 1);
-      if (tx==0)
-	sums[16*i + ty] = sum;
-    }
-    __syncthreads();
-    if (ty==0 && tx<16) { 
-      for (int j=0;j<16;j++) {
-	float sum = sums[16*j + tx];
-	if (sum>maxScore) { 
-	  maxScor2 = maxScore;
-	  maxScore = sum;
-	  maxIndex = min(blockIdx.y*512 +  k + j, numPts2-1);
-	} else if (sum>maxScor2)
-	  maxScor2 = sum;
-      }
-    }
-    __syncthreads();
-  }
-  if (tx==0 && ty==0)
-    while (atomicCAS((int *)&lock, 0, 1) != 0);
-  __syncthreads();
-  if (ty==0 && tx<16) {
-    const int p1 = min(blockIdx.x*16 + tx, numPts1-1);
-    float maxScor2Old = sift1[p1].ambiguity*(sift1[p1].score + 1e-6f);
-    if (maxScore>sift1[p1].score) {
-      maxScor2 = max(sift1[p1].score, maxScor2);
-      sift1[p1].ambiguity = maxScor2 / (maxScore + 1e-6f);
-      sift1[p1].score = maxScore;
-      sift1[p1].match = maxIndex;
-      sift1[p1].match_xpos = sift2[maxIndex].xpos;
-      sift1[p1].match_ypos = sift2[maxIndex].ypos;
-    } else if (maxScore>maxScor2Old)
-      sift1[p1].ambiguity = maxScore / (sift1[p1].score + 1e-6f);
-  }
-  __syncthreads();
-  if (tx==0 && ty==0)
-    atomicExch((int* )&lock, 0);
-}
- 
-__global__ void FindMaxCorr5(SiftPoint *sift1, SiftPoint *sift2, int numPts1, int numPts2)
-{
-  __shared__ float siftParts1[17*16]; // features in columns
-  __shared__ float siftParts2[17*16]; // one extra to avoid shared conflicts
-  const int tx = threadIdx.x;
-  const int ty = threadIdx.y;
-  const int p1l = min(blockIdx.x*16 + ty, numPts1-1);
-  const float *pt1l = sift1[p1l].data;
-  float maxScore = -1.0f;
-  float maxScor2 = -1.0f;
-  int maxIndex = 0;
-  for (int k=0;k<512/16;k++) {
-    const int p2l = min(blockIdx.y*512 + k*16 + ty, numPts2-1);
-    const float *pt2l = sift2[p2l].data;
-    float sum = 0.0f;
-    for (int i=0;i<8;i++) {
-      siftParts1[17*tx + ty] = pt1l[i*16 + tx]; // load and transpose
-      siftParts2[17*tx + ty] = pt2l[i*16 + tx];
-      __syncthreads();
-      for (int j=0;j<16;j++)
-	sum += siftParts1[17*j + tx] * siftParts2[17*j + ty];
-      __syncthreads();
-    }
-    float *sums = siftParts1;
-    sums[16*ty + tx] = sum;
-    __syncthreads();
-    if (ty==0) { 
-      for (int j=0;j<16;j++) {
-	float sum = sums[16*j + tx];
-	if (sum>maxScore) { 
-	  maxScor2 = maxScore;
-	  maxScore = sum;
-	  maxIndex = min(blockIdx.y*512 +  k*16 + j, numPts2-1);
-	} else if (sum>maxScor2)
-	  maxScor2 = sum;
-      }
-    }
-    __syncthreads();
-  }
-  const int p1 = min(blockIdx.x*16 + tx, numPts1-1);
-  if (tx==0 && ty==0)
-    while (atomicCAS((int *)&lock, 0, 1) != 0);
-  __syncthreads();
-  if (ty==0) {
-    float maxScor2Old = sift1[p1].ambiguity*(sift1[p1].score + 1e-6f);
-    if (maxScore>sift1[p1].score) {
-      maxScor2 = max(sift1[p1].score, maxScor2);
-      sift1[p1].ambiguity = maxScor2 / (maxScore + 1e-6f);
-      sift1[p1].score = maxScore;
-      sift1[p1].match = maxIndex;
-      sift1[p1].match_xpos = sift2[maxIndex].xpos;
-      sift1[p1].match_ypos = sift2[maxIndex].ypos;
-    } else if (maxScore>maxScor2Old)
-      sift1[p1].ambiguity = maxScore / (sift1[p1].score + 1e-6f);
-  }
-  __syncthreads();
-  if (tx==0 && ty==0)
-    atomicExch((int* )&lock, 0);
-}
- 
-
-template <int size>
-__device__ void InvertMatrix(float elem[size][size], float res[size][size]) 
-{  
-  int indx[size];
-  float b[size];
-  float vv[size];
-  for (int i=0;i<size;i++)
-    indx[i] = 0;
-  int imax = 0;
-  float d = 1.0;
-  for (int i=0;i<size;i++) { // find biggest element for each row
-    float big = 0.0;
-    for (int j=0;j<size;j++) {
-      float temp = fabs(elem[i][j]); 
-      if (temp>big) 
-	big = temp;
-    }
-    if (big>0.0)
-      vv[i] = 1.0/big;
-    else
-      vv[i] = 1e16;
-  }
-  for (int j=0;j<size;j++) { 
-    for (int i=0;i<j;i++) { // i<j
-      float sum = elem[i][j]; // i<j (lower left)
-      for (int k=0;k<i;k++) // k<i<j
-	sum -= elem[i][k]*elem[k][j]; // i>k (upper right), k<j (lower left)
-      elem[i][j] = sum; // i<j (lower left)
-    }
-    float big = 0.0;
-    for (int i=j;i<size;i++) { // i>=j
-      float sum = elem[i][j]; // i>=j (upper right)
-      for (int k=0;k<j;k++) // k<j<=i
-	sum -= elem[i][k]*elem[k][j]; // i>k (upper right), k<j (lower left)
-      elem[i][j] = sum; // i>=j (upper right)
-      float dum = vv[i]*fabs(sum);
-      if (dum>=big) {
-	big = dum;
-	imax = i;  
-      }
-    }
-    if (j!=imax) { // imax>j
-      for (int k=0;k<size;k++) {
-	float dum = elem[imax][k]; // upper right and lower left
-	elem[imax][k] = elem[j][k];
-	elem[j][k] = dum;
-      }
-      d = -d;
-      vv[imax] = vv[j];
-    }
-    indx[j] = imax;
-    if (elem[j][j]==0.0)  // j==j (upper right)
-      elem[j][j] = 1e-16;
-    if (j!=(size-1)) {
-      float dum = 1.0/elem[j][j];
-      for (int i=j+1;i<size;i++) // i>j
-	elem[i][j] *= dum; // i>j (upper right)
-    }
-  }
-  for (int j=0;j<size;j++) {
-    for (int k=0;k<size;k++) 
-      b[k] = 0.0;  
-    b[j] = 1.0;
-    int ii = -1;
-    for (int i=0;i<size;i++) {
-      int ip = indx[i];
-      float sum = b[ip];
-      b[ip] = b[i];
-      if (ii!=-1)
-	for (int j=ii;j<i;j++) 
-	  sum -= elem[i][j]*b[j]; // i>j (upper right)
-      else if (sum!=0.0)
-        ii = i;
-      b[i] = sum;
-    }
-    for (int i=size-1;i>=0;i--) {
-      float sum = b[i];
-      for (int j=i+1;j<size;j++) 
-	sum -= elem[i][j]*b[j]; // i<j (lower left)
-      b[i] = sum/elem[i][i]; // i==i (upper right)
-    }
-    for (int i=0;i<size;i++)
-      res[i][j] = b[i];
-  }
-}
-
-__global__ void ComputeHomographies(float *coord, int *randPts, float *homo, 
-  int numPts) 
-{
-  float a[8][8], ia[8][8];
-  float b[8]; 
-  const int bx = blockIdx.x;
-  const int tx = threadIdx.x;
-  const int idx = blockDim.x*bx + tx;
-  const int numLoops = blockDim.x*gridDim.x;
-  for (int i=0;i<4;i++) {
-    int pt = randPts[i*numLoops+idx];
-    float x1 = coord[pt+0*numPts];
-    float y1 = coord[pt+1*numPts];
-    float x2 = coord[pt+2*numPts];
-    float y2 = coord[pt+3*numPts];
-    float *row1 = a[2*i+0];
-    row1[0] = x1;
-    row1[1] = y1;
-    row1[2] = 1.0;
-    row1[3] = row1[4] = row1[5] = 0.0;
-    row1[6] = -x2*x1;
-    row1[7] = -x2*y1;
-    float *row2 = a[2*i+1];
-    row2[0] = row2[1] = row2[2] = 0.0;
-    row2[3] = x1;
-    row2[4] = y1;
-    row2[5] = 1.0;
-    row2[6] = -y2*x1;
-    row2[7] = -y2*y1;
-    b[2*i+0] = x2;
-    b[2*i+1] = y2;
-  }
-  InvertMatrix<8>(a, ia);
-  __syncthreads();
-  for (int j=0;j<8;j++) {
-    float sum = 0.0f;
-    for (int i=0;i<8;i++) 
-      sum += ia[j][i]*b[i];
-    homo[j*numLoops+idx] = sum;
-  }
-  __syncthreads();
-}
-
-#define TESTHOMO_TESTS 16 // number of tests per block,  alt. 32, 32
-#define TESTHOMO_LOOPS 16 // number of loops per block,  alt.  8, 16 
-
-__global__ void TestHomographies(float *d_coord, float *d_homo, 
-  int *d_counts, int numPts, float thresh2)
-{
-  __shared__ float homo[8*TESTHOMO_LOOPS];
-  __shared__ int cnts[TESTHOMO_TESTS*TESTHOMO_LOOPS];
-  const int tx = threadIdx.x;
-  const int ty = threadIdx.y;
-  const int idx = blockIdx.y*blockDim.y + tx;
-  const int numLoops = blockDim.y*gridDim.y;
-  if (ty<8 && tx<TESTHOMO_LOOPS)
-    homo[tx*8+ty] = d_homo[idx+ty*numLoops];
-  __syncthreads();
-  float a[8];
-  for (int i=0;i<8;i++) 
-    a[i] = homo[ty*8+i];
-  int cnt = 0;
-  for (int i=tx;i<numPts;i+=TESTHOMO_TESTS) {
-    float x1 = d_coord[i+0*numPts];
-    float y1 = d_coord[i+1*numPts];
-    float x2 = d_coord[i+2*numPts];
-    float y2 = d_coord[i+3*numPts];
-    float nomx = __fmul_rz(a[0],x1) + __fmul_rz(a[1],y1) + a[2];
-    float nomy = __fmul_rz(a[3],x1) + __fmul_rz(a[4],y1) + a[5];
-    float deno = __fmul_rz(a[6],x1) + __fmul_rz(a[7],y1) + 1.0f;
-    float errx = __fmul_rz(x2,deno) - nomx;
-    float erry = __fmul_rz(y2,deno) - nomy;
-    float err2 = __fmul_rz(errx,errx) + __fmul_rz(erry,erry);
-    if (err2<__fmul_rz(thresh2,__fmul_rz(deno,deno)))
-      cnt ++;
-  }
-  int kty = TESTHOMO_TESTS*ty;
-  cnts[kty + tx] = cnt;
-  __syncthreads();
-  int len = TESTHOMO_TESTS/2;
-  while (len>0) {
-    if (tx<len)
-      cnts[kty + tx] += cnts[kty + tx + len];
-    len /= 2;
-    __syncthreads();
-  }
-  if (tx<TESTHOMO_LOOPS && ty==0)
-    d_counts[idx] = cnts[TESTHOMO_TESTS*tx];
-  __syncthreads();
-}
-
-//================= Host matching functions =====================//
-
-double FindHomography(SiftData &data, float *homography, int *numMatches, int numLoops, float minScore, float maxAmbiguity, float thresh)
-{
-  *numMatches = 0;
-  homography[0] = homography[4] = homography[8] = 1.0f;
-  homography[1] = homography[2] = homography[3] = 0.0f;
-  homography[5] = homography[6] = homography[7] = 0.0f;
-#ifdef MANAGEDMEM
-  SiftPoint *d_sift = data.m_data;
-#else
-  if (data.d_data==NULL)
-    return 0.0f;
-  SiftPoint *d_sift = data.d_data;
-#endif
-  TimerGPU timer(0);
-  numLoops = iDivUp(numLoops,16)*16;
-  int numPts = data.numPts;
-  if (numPts<8)
-    return 0.0f;
-  int numPtsUp = iDivUp(numPts, 16)*16;
-  float *d_coord, *d_homo;
-  int *d_randPts, *h_randPts;
-  int randSize = 4*sizeof(int)*numLoops;
-  int szFl = sizeof(float);
-  int szPt = sizeof(SiftPoint);
-  safeCall(cudaMalloc((void **)&d_coord, 4*sizeof(float)*numPtsUp));
-  safeCall(cudaMalloc((void **)&d_randPts, randSize));
-  safeCall(cudaMalloc((void **)&d_homo, 8*sizeof(float)*numLoops));
-  h_randPts = (int*)malloc(randSize);
-  float *h_scores = (float *)malloc(sizeof(float)*numPtsUp);
-  float *h_ambiguities = (float *)malloc(sizeof(float)*numPtsUp);
-  safeCall(cudaMemcpy2D(h_scores, szFl, &d_sift[0].score, szPt, szFl, numPts, cudaMemcpyDeviceToHost));
-  safeCall(cudaMemcpy2D(h_ambiguities, szFl, &d_sift[0].ambiguity, szPt, szFl, numPts, cudaMemcpyDeviceToHost));
-  int *validPts = (int *)malloc(sizeof(int)*numPts);
-  int numValid = 0;
-  for (int i=0;i<numPts;i++) {
-    if (h_scores[i]>minScore && h_ambiguities[i]<maxAmbiguity)
-      validPts[numValid++] = i;
-  }
-  free(h_scores);
-  free(h_ambiguities);
-  if (numValid>=8) {
-    for (int i=0;i<numLoops;i++) {
-      int p1 = rand() % numValid;
-      int p2 = rand() % numValid;
-      int p3 = rand() % numValid;
-      int p4 = rand() % numValid;
-      while (p2==p1) p2 = rand() % numValid;
-      while (p3==p1 || p3==p2) p3 = rand() % numValid;
-      while (p4==p1 || p4==p2 || p4==p3) p4 = rand() % numValid;
-      h_randPts[i+0*numLoops] = validPts[p1];
-      h_randPts[i+1*numLoops] = validPts[p2];
-      h_randPts[i+2*numLoops] = validPts[p3];
-      h_randPts[i+3*numLoops] = validPts[p4];
-    }
-    safeCall(cudaMemcpy(d_randPts, h_randPts, randSize, cudaMemcpyHostToDevice));
-    safeCall(cudaMemcpy2D(&d_coord[0*numPtsUp], szFl, &d_sift[0].xpos, szPt, szFl, numPts, cudaMemcpyDeviceToDevice));
-    safeCall(cudaMemcpy2D(&d_coord[1*numPtsUp], szFl, &d_sift[0].ypos, szPt, szFl, numPts, cudaMemcpyDeviceToDevice));
-    safeCall(cudaMemcpy2D(&d_coord[2*numPtsUp], szFl, &d_sift[0].match_xpos, szPt, szFl, numPts, cudaMemcpyDeviceToDevice));
-    safeCall(cudaMemcpy2D(&d_coord[3*numPtsUp], szFl, &d_sift[0].match_ypos, szPt, szFl, numPts, cudaMemcpyDeviceToDevice));
-    ComputeHomographies<<<numLoops/16, 16>>>(d_coord, d_randPts, d_homo, numPtsUp);
-    safeCall(cudaDeviceSynchronize());
-    checkMsg("ComputeHomographies() execution failed\n");
-    dim3 blocks(1, numLoops/TESTHOMO_LOOPS);
-    dim3 threads(TESTHOMO_TESTS, TESTHOMO_LOOPS);
-    TestHomographies<<<blocks, threads>>>(d_coord, d_homo, d_randPts, numPtsUp, thresh*thresh);
-    safeCall(cudaDeviceSynchronize());
-    checkMsg("TestHomographies() execution failed\n");
-    safeCall(cudaMemcpy(h_randPts, d_randPts, sizeof(int)*numLoops, cudaMemcpyDeviceToHost));
-    int maxIndex = -1, maxCount = -1;
-    for (int i=0;i<numLoops;i++) 
-      if (h_randPts[i]>maxCount) {
-	maxCount = h_randPts[i];
-	maxIndex = i;
-      }
-    *numMatches = maxCount;
-    safeCall(cudaMemcpy2D(homography, szFl, &d_homo[maxIndex], sizeof(float)*numLoops, szFl, 8, cudaMemcpyDeviceToHost));
-  }
-  free(validPts);
-  free(h_randPts);
-  safeCall(cudaFree(d_homo));
-  safeCall(cudaFree(d_randPts));
-  safeCall(cudaFree(d_coord));
-  double gpuTime = timer.read();
-#ifdef VERBOSE
-  printf("FindHomography time =         %.2f ms\n", gpuTime);
-#endif
-  return gpuTime;
-}
-
-
-double MatchSiftData(SiftData &data1, SiftData &data2)
-{
-  TimerGPU timer(0);
-  int numPts1 = data1.numPts;
-  int numPts2 = data2.numPts;
-  if (!numPts1 || !numPts2) 
-    return 0.0;
-#ifdef MANAGEDMEM
-  SiftPoint *sift1 = data1.m_data;
-  SiftPoint *sift2 = data2.m_data;
-#else
-  if (data1.d_data==NULL || data2.d_data==NULL)
-    return 0.0f;
-  SiftPoint *sift1 = data1.d_data;
-  SiftPoint *sift2 = data2.d_data;
-#endif
-  
-// Original version with correlation and maximization in two different kernels
-// Global memory reguirement: O(N^2)
-#if 0
-  float *d_corrData; 
-  int corrWidth = iDivUp(numPts2, 16)*16;
-  int corrSize = sizeof(float)*numPts1*corrWidth;
-  safeCall(cudaMalloc((void **)&d_corrData, corrSize));
-#if 0 // K40c 10.9ms, 1080 Ti 3.8ms
-  dim3 blocks1(numPts1, iDivUp(numPts2, 16));
-  dim3 threads1(16, 16); // each block: 1 points x 16 points
-  MatchSiftPoints<<<blocks1, threads1>>>(sift1, sift2, d_corrData, numPts1, numPts2);
-#else // K40c 7.6ms, 1080 Ti 1.4ms
-  dim3 blocks(iDivUp(numPts1,16), iDivUp(numPts2, 16));
-  dim3 threads(16, 16); // each block: 16 points x 16 points
-  MatchSiftPoints2<<<blocks, threads>>>(sift1, sift2, d_corrData, numPts1, numPts2);
-#endif
-  safeCall(cudaDeviceSynchronize());
-  dim3 blocksMax(iDivUp(numPts1, 16));
-  dim3 threadsMax(16, 16);
-  FindMaxCorr<<<blocksMax, threadsMax>>>(d_corrData, sift1, sift2, numPts1, corrWidth, sizeof(SiftPoint));
-  safeCall(cudaDeviceSynchronize());
-  checkMsg("FindMaxCorr() execution failed\n");
-  safeCall(cudaFree(d_corrData));
-#endif
-
-// Version suggested by Nicholas Lin with combined correlation and maximization
-// Global memory reguirement: O(N)
-#if 0 // K40c 51.2ms, 1080 Ti 9.6ms
-  int block_dim = 16;
-  float *d_corrData;
-  int corrSize = numPts1 * block_dim * 2;
-  safeCall(cudaMalloc((void **)&d_corrData, sizeof(float) * corrSize));
-  dim3 blocks(iDivUp(numPts1, block_dim));
-  dim3 threads(block_dim, block_dim); 
-  FindMaxCorr3<<<blocks, threads >>>(d_corrData, sift1, sift2, numPts1, numPts2);
-  safeCall(cudaDeviceSynchronize());
-  checkMsg("FindMaxCorr3() execution failed\n");
-  safeCall(cudaFree(d_corrData));
-#endif
-
-// Combined version with no global memory requirement using one 1 point per block
-#if 0 // K40c 8.9ms, 1080 Ti 2.1ms, 2080 Ti 1.0ms
-  dim3 blocksMax(numPts1);
-  dim3 threadsMax(FMC2W, FMC2H);
-  FindMaxCorr2<<<blocksMax, threadsMax>>>(sift1, sift2, numPts1, numPts2);
-  safeCall(cudaDeviceSynchronize());
-  checkMsg("FindMaxCorr2() execution failed\n");
-#endif
-  
-// Combined version with no global memory requirement using one FMC2H points per block
-#if 0 // K40c 9.2ms, 1080 Ti 1.3ms, 2080 Ti 1.1ms
-  dim3 blocksMax2(iDivUp(numPts1, FMC2H));
-  dim3 threadsMax2(FMC2W, FMC2H);
-  FindMaxCorr4<<<blocksMax2, threadsMax2>>>(sift1, sift2, numPts1, numPts2);
-  safeCall(cudaDeviceSynchronize());
-  checkMsg("FindMaxCorr4() execution failed\n");
-#endif
-
-// Combined version with no global memory requirement using global locks
-#if 1
-  dim3 blocksMax3(iDivUp(numPts1, 16), iDivUp(numPts2, 512));
-  dim3 threadsMax3(16, 16);
-  CleanMatches<<<iDivUp(numPts1, 64), 64>>>(sift1, numPts1);
-  int mode = 10;
-  if (mode==5)// K40c 5.0ms, 1080 Ti 1.2ms, 2080 Ti 0.83ms
-    FindMaxCorr5<<<blocksMax3, threadsMax3>>>(sift1, sift2, numPts1, numPts2);
-  else if (mode==6) {                    // 2080 Ti 0.89ms
-    threadsMax3 = dim3(32, 16);
-    FindMaxCorr6<<<blocksMax3, threadsMax3>>>(sift1, sift2, numPts1, numPts2);
-  } else if (mode==7)                    // 2080 Ti 0.50ms  
-    FindMaxCorr7<<<blocksMax3, threadsMax3>>>(sift1, sift2, numPts1, numPts2);
-  else if (mode==8) {                    // 2080 Ti 0.45ms
-    blocksMax3 = dim3(iDivUp(numPts1, FMC_BW), iDivUp(numPts2, FMC_GH));
-    threadsMax3 = dim3(FMC_NW, FMC_NH);
-    FindMaxCorr8<<<blocksMax3, threadsMax3>>>(sift1, sift2, numPts1, numPts2);
-  } else if (mode==9) {                  // 2080 Ti 0.46ms
-    blocksMax3 = dim3(iDivUp(numPts1, FMC_BW), iDivUp(numPts2, FMC_GH));
-    threadsMax3 = dim3(FMC_NW, FMC_NH);
-    FindMaxCorr9<<<blocksMax3, threadsMax3>>>(sift1, sift2, numPts1, numPts2);
-  } else if (mode==10) {                 // 2080 Ti 0.24ms
-    blocksMax3 = dim3(iDivUp(numPts1, M7W));
-    threadsMax3 = dim3(M7W, M7H/M7R);
-    FindMaxCorr10<<<blocksMax3, threadsMax3>>>(sift1, sift2, numPts1, numPts2);
-  }
-  safeCall(cudaDeviceSynchronize());
-  checkMsg("FindMaxCorr5() execution failed\n");
-#endif
-
-  if (data1.h_data!=NULL) {
-    float *h_ptr = &data1.h_data[0].score;
-    float *d_ptr = &data1.d_data[0].score;
-    safeCall(cudaMemcpy2D(h_ptr, sizeof(SiftPoint), d_ptr, sizeof(SiftPoint), 5*sizeof(float), data1.numPts, cudaMemcpyDeviceToHost));
-  }
-
-  double gpuTime = timer.read();
-#ifndef VERBOSE
-  printf("MatchSiftData time =          %.2f ms\n", gpuTime);
-#endif
-  return gpuTime;
-}		 
-  
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..af5e3a6
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,112 @@
+cmake_minimum_required(VERSION 3.10)
+project(syclsift LANGUAGES CXX)
+
+# Language standard: applies to every target declared in this file.
+set(CMAKE_CXX_STANDARD 17)           # SYCL code requires C++17
+set(CMAKE_CXX_STANDARD_REQUIRED ON)  # Fail instead of falling back to an older standard
+set(CMAKE_CXX_EXTENSIONS OFF)        # Use -std=c++17, not -std=gnu++17
+
+# User-facing configuration.
+# GPU_AOT and USE_AMDHIP_BACKEND double as value carriers further down this
+# file: GPU_AOT accepts "pvc"/"PVC" or a custom flag string, and the value of
+# USE_AMDHIP_BACKEND is forwarded as the --offload-arch argument.
+option(GPU_AOT                 "Build AOT for Intel GPU"      OFF)
+option(USE_NVIDIA_BACKEND      "Build for NVIDIA backend"     OFF)
+option(USE_AMDHIP_BACKEND      "Build for AMD HIP backend"    OFF)
+option(USE_INTEL_CPU           "Build for INTEL CPU"    OFF)
+# option() only models booleans. The SM number and the OpenCV location are
+# free-form values, so they are declared as typed cache entries instead.
+# Values passed with -D on the command line are kept as given.
+set(USE_SM     "" CACHE STRING "Streaming multiprocessor architecture to use, without the sm_ prefix (e.g. 80)")
+set(OpenCV_DIR "" CACHE PATH   "Path to the directory containing OpenCVConfig.cmake")
+option(DEVICE_TIMER            "Build using Device Timer" OFF)
+
+
+# Locate OpenCV. If it is not found automatically, set the OpenCV_DIR
+# variable (command line or GUI) to the absolute path of the directory that
+# contains the OpenCVConfig.cmake file.
+find_package(OpenCV REQUIRED)
+
+# Report what was found. The full list of variables exported by the package
+# is described in the OpenCVConfig.cmake file.
+message(STATUS "OpenCV library status:")
+message(STATUS "    version: ${OpenCV_VERSION}")
+message(STATUS "    libraries: ${OpenCV_LIBS}")
+message(STATUS "    include path: ${OpenCV_INCLUDE_DIRS}")
+
+# The OpenCV include path is added by the include_directories() call that
+# follows the source list. The former "CMake older than 2.8.11" fallback was
+# dropped: cmake_minimum_required(VERSION 3.10) makes that branch unreachable.
+
+
+# Translation units and headers that make up the executable. Utility.cpp is
+# shared code that lives in the sibling ../common directory.
+set(SOURCES
+    ${CMAKE_CURRENT_SOURCE_DIR}/../common/Utility.cpp
+    cudaImage.dp.cpp
+    cudaImage.h
+    cudaSiftH.dp.cpp
+    cudaSiftH.h
+    matching.dp.cpp
+    cudaSiftD.h
+    cudaSift.h
+    geomFuncs.cpp
+    mainSift.cpp
+)
+
+# Paths are anchored at this CMakeLists.txt rather than at the top of the
+# build tree (CMAKE_SOURCE_DIR), so they still resolve when this directory is
+# pulled into a larger build through add_subdirectory().
+include_directories(
+    ${CMAKE_CURRENT_SOURCE_DIR}/../common/
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    ${OpenCV_INCLUDE_DIRS}
+)
+
+# Optional instrumentation: passes -DDEVICE_TIMER to every compile command.
+if(DEVICE_TIMER)
+    message(STATUS "Enabling Device Timer")
+    add_compile_options("-DDEVICE_TIMER")
+endif()
+
+# Default flag sets, one per backend. They are only used when the user did
+# not supply flags: -DCMAKE_CXX_FLAGS=" -blah -blah " replaces the defaults.
+set(INTEL_CPU_CXX_FLAGS  " -O2 -fsycl -Wall -Wextra -Wno-unused-parameter ")
+set(INTEL_GPU_CXX_FLAGS  " -O2 -fsycl -ffast-math")
+set(NVIDIA_GPU_CXX_FLAGS " -O3 -fsycl -ffast-math ")
+set(AMD_GPU_CXX_FLAGS    " -O3 -fsycl -ffast-math ")
+
+# USE_DEFAULT_FLAGS records which case applies so the backend branches know
+# whether they may substitute their own default set.
+if(NOT "${CMAKE_CXX_FLAGS}" STREQUAL "")
+    message(STATUS "OVERRIDING compilation flags")
+    set(USE_DEFAULT_FLAGS OFF)
+else()
+    message(STATUS "Using DEFAULT compilation flags for the application")
+    set(USE_DEFAULT_FLAGS ON)
+    set(CMAKE_CXX_FLAGS "${INTEL_GPU_CXX_FLAGS}") # Intel GPU set is the starting point
+endif()
+
+# Backend selection. At most one branch is taken, in this priority order:
+# Intel GPU AOT, NVIDIA, AMD HIP, Intel CPU. When none is enabled,
+# CMAKE_CXX_FLAGS is left as it was on entry to this block.
+# Every appended fragment starts with a space so it cannot fuse with the last
+# token of user-supplied flags.
+if(GPU_AOT)
+    # Quoted operands: the value is compared as text and never re-expanded
+    # as a variable name.
+    if(("${GPU_AOT}" STREQUAL "pvc") OR ("${GPU_AOT}" STREQUAL "PVC"))
+        message(STATUS "Enabling Intel GPU AOT compilation for ${GPU_AOT}")
+        string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=spir64_gen -Xs \"-device 0x0bd5 -revision_id 0x2f\" -Xs \"-options -ze-opt-large-register-file\" ")
+    else()
+        message(STATUS "Using custom AOT compilation flag ${GPU_AOT}")
+        string(APPEND CMAKE_CXX_FLAGS " ${GPU_AOT} ") # User should be aware of advanced AOT compilation flags
+    endif()
+elseif(USE_NVIDIA_BACKEND)
+    message(STATUS "Enabling NVIDIA backend")
+    # Without a value the flag below would become --cuda-gpu-arch=sm_ (or
+    # sm_OFF), which only fails later at compile time with a confusing error.
+    if(NOT USE_SM)
+        message(FATAL_ERROR "USE_NVIDIA_BACKEND requires the target architecture, e.g. -DUSE_SM=80")
+    endif()
+    if(USE_DEFAULT_FLAGS)
+        set(CMAKE_CXX_FLAGS "${NVIDIA_GPU_CXX_FLAGS}") # Default flags for NV backend; -O3 replaces the -O2 set earlier
+    endif()
+    string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_${USE_SM}")
+elseif(USE_AMDHIP_BACKEND)
+    # The option's value doubles as the architecture name passed to
+    # --offload-arch.
+    message(STATUS "Enabling AMD HIP backend for ${USE_AMDHIP_BACKEND} AMD architecture")
+    if(USE_DEFAULT_FLAGS)
+        set(CMAKE_CXX_FLAGS "${AMD_GPU_CXX_FLAGS}")
+    endif()
+    string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${USE_AMDHIP_BACKEND}  ")
+elseif(USE_INTEL_CPU)
+    message(STATUS "Enabling INTEL CPU backend")
+    if(USE_DEFAULT_FLAGS)
+        set(CMAKE_CXX_FLAGS "${INTEL_CPU_CXX_FLAGS}")
+    endif()
+    string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=spir64_x86_64 -Xsycl-target-backend \"--march=avx512\"  ")
+endif()
+
+# Output the compiler flags that were constructed for visual inspection
+message(STATUS "Compilation flags set to: ${CMAKE_CXX_FLAGS}")
+
+# Single executable named after the project. PRIVATE selects the keyword
+# signature of target_link_libraries(); an executable has no consumers, so
+# nothing needs to propagate.
+add_executable(${PROJECT_NAME} ${SOURCES})
+target_link_libraries(${PROJECT_NAME} PRIVATE ${OpenCV_LIBS} stdc++ stdc++fs)
diff --git a/src/cudaImage.dp.cpp b/src/cudaImage.dp.cpp
new file mode 100644
index 0000000..25ce314
--- /dev/null
+++ b/src/cudaImage.dp.cpp
@@ -0,0 +1,112 @@
+//********************************************************//
+// CUDA SIFT extractor by Marten Bjorkman aka Celebrandil //
+//********************************************************//
+
+// Modifications Copyright (C) 2023 Intel Corporation
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom
+// the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
+// OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+// OR OTHER DEALINGS IN THE SOFTWARE.
+
+// SPDX-License-Identifier: MIT
+
+#include <sycl/sycl.hpp>
+#include <cstdio>
+
+#include "infra/memory.hpp"
+#include "cudautils.h"
+#include "cudaImage.h"
+
+// Integer rounding helpers. None of them guards against b == 0.
+
+// a / b, plus one when the division leaves a remainder.
+int iDivUp(int a, int b)
+{
+  const int quotient = a / b;
+  return (a % b == 0) ? quotient : quotient + 1;
+}
+
+// Plain truncating integer division.
+int iDivDown(int a, int b)
+{
+  return a / b;
+}
+
+// a itself when it is a multiple of b; otherwise a with its remainder
+// removed and one full b added.
+int iAlignUp(int a, int b)
+{
+  const int remainder = a % b;
+  return (remainder == 0) ? a : (a - remainder + b);
+}
+
+// a with its remainder modulo b removed.
+int iAlignDown(int a, int b)
+{
+  const int remainder = a % b;
+  return a - remainder;
+}
+
+// Binds this image to device (and optionally host) storage for w x h floats.
+//
+//   w, h     image dimensions
+//   p        initial pitch; kept unchanged when devmem is supplied
+//   host     when true and hostmem is NULL, a host buffer is malloc'ed here
+//   q_ct     queue handed to the device allocator and then waited on
+//   time     accumulates the allocation time in microseconds; only touched
+//            when DEVICE_TIMER is defined
+//   devmem   caller-provided device buffer, or NULL to allocate one here
+//   hostmem  caller-provided host buffer, or NULL
+//
+// d_internalAlloc / h_internalAlloc are raised only for buffers that were
+// successfully allocated here; the destructor uses them to decide what to
+// release. Allocation failures are reported on stdout and leave the
+// corresponding pointer NULL.
+void CudaImage::Allocate(int w, int h, int p, bool host, sycl::queue &q_ct, float &time, float *devmem, float *hostmem)
+{
+  width = w;
+  height = h;
+  pitch = p;
+  d_data = devmem;
+  h_data = hostmem;
+  if (devmem == NULL)
+  {
+#ifdef DEVICE_TIMER
+    auto start_malloc = std::chrono::steady_clock::now();
+#endif
+    d_data = (float *)infra::sift_malloc(pitch, (size_t)(sizeof(float) * width), (size_t)height, q_ct);
+    q_ct.wait();
+#ifdef DEVICE_TIMER
+    auto stop_malloc = std::chrono::steady_clock::now();
+    std::cout << "Allocate Time is " << std::chrono::duration<float, std::micro>(stop_malloc - start_malloc).count() << " us" << std::endl;
+    time += std::chrono::duration<float, std::micro>(stop_malloc - start_malloc).count();
+#endif
+
+    // NOTE(review): presumably sift_malloc writes the row pitch in bytes
+    // back through its first argument, which is why it is rescaled to
+    // float elements here -- confirm against infra/memory.hpp.
+    pitch /= sizeof(float);
+    if (d_data == NULL)
+      printf("Failed to allocate device data\n");
+    // Claim ownership only of a buffer that actually exists.
+    d_internalAlloc = (d_data != NULL);
+  }
+  if (host && hostmem == NULL)
+  {
+    h_data = (float *)malloc(sizeof(float) * pitch * height);
+    if (h_data == NULL)
+      printf("Failed to allocate host data\n");
+    h_internalAlloc = (h_data != NULL);
+  }
+}
+
+// Creates an empty image: zero dimensions, no buffers, nothing owned.
+// The initializers are listed in member declaration order (h_data precedes
+// d_data in cudaImage.h), which is the order they are executed in regardless
+// of how they are written; listing them that way avoids -Wreorder warnings.
+CudaImage::CudaImage()
+    : width(0), height(0), pitch(0),
+      h_data(NULL), d_data(NULL),
+      d_internalAlloc(false), h_internalAlloc(false)
+{
+}
+
+// Releases only the buffers flagged as internally allocated, then clears
+// both pointers. An exception thrown while freeing device memory is printed
+// to stderr and not propagated out of the destructor.
+CudaImage::~CudaImage()
+{
+  const bool ownsDevice = d_internalAlloc && d_data != NULL;
+  if (ownsDevice)
+  {
+    try
+    {
+      safeCall((sycl::free(d_data, infra::get_default_queue()), 0));
+    }
+    catch (std::exception const &ex)
+    {
+      std::cerr << ex.what() << '\n';
+    }
+  }
+  d_data = NULL;
+
+  const bool ownsHost = h_internalAlloc && h_data != NULL;
+  if (ownsHost)
+  {
+    free(h_data);
+  }
+  h_data = NULL;
+}
+
+// Copies the host buffer into the device buffer (the transfer direction is
+// host_to_device, despite the name) and waits for the queue to finish.
+// Does nothing when either buffer is missing.
+//
+// Returns the elapsed copy time in microseconds when DEVICE_TIMER is
+// defined, otherwise 0.0. With DEVICE_TIMER the same value is also added to
+// `time` and printed to stdout.
+double CudaImage::Download(sycl::queue &q_ct, float &time)
+{
+  if (d_data == NULL || h_data == NULL)
+    return 0.0;
+
+  double elapsedUs = 0.0;
+  // Device row pitch scaled by sizeof(float) for the copy call.
+  int devicePitchBytes = sizeof(float) * pitch;
+#ifdef DEVICE_TIMER
+  auto copyBegin = std::chrono::steady_clock::now();
+#endif
+  infra::sift_memcpy(d_data, devicePitchBytes, h_data, sizeof(float) * width, sizeof(float) * width, height, infra::host_to_device, q_ct);
+  q_ct.wait();
+
+#ifdef DEVICE_TIMER
+  auto copyEnd = std::chrono::steady_clock::now();
+  elapsedUs = std::chrono::duration<float, std::micro>(copyEnd - copyBegin).count();
+  time += elapsedUs;
+  std::cout << "Download Time is " << elapsedUs << " us" << std::endl;
+#endif
+  return elapsedUs;
+}
diff --git a/src/cudaImage.h b/src/cudaImage.h
new file mode 100644
index 0000000..0ce1a92
--- /dev/null
+++ b/src/cudaImage.h
@@ -0,0 +1,59 @@
+//********************************************************//
+// CUDA SIFT extractor by Marten Bjorkman aka Celebrandil //
+//********************************************************//
+
+// Modifications Copyright (C) 2023 Intel Corporation
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom
+// the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
+// OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+// OR OTHER DEALINGS IN THE SOFTWARE.
+
+// SPDX-License-Identifier: MIT
+
+#ifndef CUDAIMAGE_H
+#define CUDAIMAGE_H
+
+#include <stddef.h>
+#include <sycl/sycl.hpp>
+
+// A 2-D float image with an optional host buffer and a device buffer.
+// Objects cannot be copied; each buffer is released by the destructor only
+// when the matching *_internalAlloc flag is set.
+class CudaImage
+{
+public:
+  int width;            // image width, as passed to Allocate()
+  int height;           // image height, as passed to Allocate()
+  size_t pitch;         // row pitch of the device buffer
+  float *h_data;        // host buffer, or NULL
+  float *d_data;        // device buffer, or NULL
+  bool d_internalAlloc; // true when d_data was allocated by Allocate()
+  bool h_internalAlloc; // true when h_data was allocated by Allocate()
+
+  CudaImage();
+  ~CudaImage();
+
+  // Non-copyable.
+  CudaImage(const CudaImage &) = delete;
+  CudaImage &operator=(const CudaImage &) = delete;
+
+  // Sets the dimensions and binds or allocates the buffers. devMem / hostMem
+  // may be supplied by the caller; NULL requests an internal allocation (for
+  // the host buffer only when withHost is true).
+  void Allocate(int width, int height, int pitch, bool withHost, sycl::queue &q_ct, float &totTime, float *devMem = NULL, float *hostMem = NULL);
+
+  // Transfers the host buffer to the device buffer; returns the measured
+  // transfer time (0.0 unless built with DEVICE_TIMER).
+  double Download(sycl::queue &q_ct, float &totTime);
+};
+
+int iDivUp(int a, int b); // a / b, plus one when there is a remainder (defined in cudaImage.dp.cpp)
+int iDivDown(int a, int b); // truncating a / b
+int iAlignUp(int a, int b); // a moved up to a multiple of b unless it already is one
+int iAlignDown(int a, int b); // a minus its remainder modulo b
+void StartTimer(unsigned int *hTimer); // NOTE(review): no definition appears in cudaImage.dp.cpp -- confirm this is still implemented somewhere
+double StopTimer(unsigned int hTimer); // NOTE(review): same as StartTimer -- confirm
+
+#endif // CUDAIMAGE_H
diff --git a/src/cudaSift.h b/src/cudaSift.h
new file mode 100644
index 0000000..8bdada3
--- /dev/null
+++ b/src/cudaSift.h
@@ -0,0 +1,87 @@
+// Modifications Copyright (C) 2023 Intel Corporation
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom
+// the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
+// OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+// OR OTHER DEALINGS IN THE SOFTWARE.
+
+// SPDX-License-Identifier: MIT
+
+#ifndef CUDASIFT_H
+#define CUDASIFT_H
+
+#include "cudaImage.h"
+
+struct rawImg_data
+{
+  float *raw_d_data; // base pointer of the image
+  int pitch;         // row stride, counted in floats
+
+  // Store the row stride used by read().
+  void set_pitch(int pitch) { this->pitch = pitch; }
+
+  // Nearest-texel fetch: both coordinates are truncated toward zero and used
+  // to index the image directly; no clamping or bounds check is performed.
+  float read(float xf, float yf)
+  {
+    const int col = static_cast<int>(xf);
+    const int row = static_cast<int>(yf);
+    return raw_d_data[row * pitch + col];
+  }
+};
+
+typedef struct // One SIFT feature: 16 scalar header fields followed by a 128-float descriptor.
+{
+  float xpos; // keypoint position; the descriptor kernels multiply xpos/ypos/scale by the subsampling factor
+  float ypos;
+  float scale;
+  float sharpness;
+  float edgeness;
+  float orientation; // in degrees: the kernels in cudaSiftD.dp.cpp scale it by 2*pi/360 before sin/cos
+  float score;
+  float ambiguity;
+  int match; // NOTE(review): match* fields are presumably filled by MatchSiftData -- confirm
+  float match_xpos;
+  float match_ypos;
+  float match_error;
+  float subsampling;
+  float empty[3];  // filler bringing the header to 16 scalars
+  float data[128]; // descriptor values, one per work-item of the descriptor kernels
+} SiftPoint;
+
+typedef struct // Set of SiftPoint records; which storage pointers exist depends on MANAGEDMEM.
+{
+  int numPts; // Number of available Sift points
+  int maxPts; // Number of allocated Sift points
+#ifdef MANAGEDMEM
+  SiftPoint *m_data; // Managed data
+#else
+  SiftPoint *h_data; // Host (CPU) data
+  SiftPoint *d_data; // Device (GPU) data
+#endif
+} SiftData;
+
+void InitCuda(sycl::queue &q_ct, int devNum = 0); // Host-side API declared below: every entry point takes the SYCL queue (q_ct) it operates on.
+float *AllocSiftTempMemory(int width, int height, int numOctaves, sycl::queue &q_ct, float &totTime, bool scaleUp = false); // NOTE(review): result is presumably released with FreeSiftTempMemory -- confirm
+void FreeSiftTempMemory(float *memoryTmp, sycl::queue &q_ct);
+void ExtractSift(SiftData &siftData, CudaImage &img, int numOctaves, double initBlur, float thresh, sycl::queue &q_ct, float &totTime, float lowestScale = 0.0f, bool scaleUp = false, float *tempMemory = 0); // NOTE(review): totTime looks like a timing accumulator -- confirm units
+void InitSiftData(SiftData &data, sycl::queue &q_ct, float &totTime, int num = 1024, bool host = false, bool dev = true);
+void FreeSiftData(SiftData &data, sycl::queue &q_ct);
+void PrintSiftData(SiftData &data, sycl::queue &q_ct);
+double MatchSiftData(SiftData &data1, SiftData &data2, sycl::queue &q_ct, float &time);
+double FindHomography(SiftData &data, float *homography, int *numMatches, sycl::queue &q_ct, float &time, int numLoops = 1000, float minScore = 0.85f, float maxAmbiguity = 0.95f, float thresh = 5.0f); // NOTE(review): size/layout of the homography buffer is not visible here
+
+#endif
diff --git a/src/cudaSiftD.dp.cpp b/src/cudaSiftD.dp.cpp
new file mode 100644
index 0000000..8b6d565
--- /dev/null
+++ b/src/cudaSiftD.dp.cpp
@@ -0,0 +1,1374 @@
+// Modifications Copyright (C) 2023 Intel Corporation
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom
+// the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
+// OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+// OR OTHER DEALINGS IN THE SOFTWARE.
+
+// SPDX-License-Identifier: MIT
+
+#include <sycl/sycl.hpp>
+
+#include "infra/infra.hpp"
+#include "cudautils.h"
+#include "cudaSiftD.h"
+#include "cudaSift.h"
+
+///////////////////////////////////////////////////////////////////////////////
+// Kernel configuration
+///////////////////////////////////////////////////////////////////////////////
+
+infra::constant_memory<int, 0> d_MaxNumPoints; // scalar; kernels take a same-named parameter and clamp the point counters with it
+infra::global_memory<unsigned int, 1> d_PointCounter(8 * 2 + 1); // 17 counters; kernels index the same-named parameter with 2 * octave - 1, + 0 and + 1
+infra::constant_memory<float, 1> d_ScaleDownKernel(5); // 5 taps; the scale-down kernels read taps [0..2] and apply them symmetrically
+infra::constant_memory<float, 1> d_LowPassKernel(2 * LOWPASS_R + 1); // 2 * LOWPASS_R + 1 taps
+infra::constant_memory<float, 1> d_LaplaceKernel(8 * 12 * 16); // 1536 floats; NOTE(review): layout is not visible in this part of the file
+
+///////////////////////////////////////////////////////////////////////////////
+// Lowpass filter and subsample image
+///////////////////////////////////////////////////////////////////////////////
+void ScaleDownDenseShift(float *d_Result, float *d_Data, int width, int pitch, int height, int newpitch,
+                         sycl::nd_item<3> item_ct1, float *d_ScaleDownKernel,
+                         float *brows)
+{ // Halves an image with a symmetric 5-tap filter (k0,k1,k2,k1,k0): rows first (via ShiftDown), then columns (via brows).
+#define BW (SCALEDOWN_W + 4)
+#define BH (SCALEDOWN_H + 4)
+#define W2 (SCALEDOWN_W / 2)
+#define H2 (SCALEDOWN_H / 2)
+  // NOTE(review): BW/BH/W2/H2 are not #undef'd after this function, and BH is not used in it.
+  const int tx = item_ct1.get_local_id(2);
+  const int ty = item_ct1.get_local_id(1);
+  const int xp = item_ct1.get_group(2) * SCALEDOWN_W + tx;
+  const int yp = item_ct1.get_group(1) * SCALEDOWN_H + ty;
+  const float k0 = d_ScaleDownKernel[0];
+  const float k1 = d_ScaleDownKernel[1];
+  const float k2 = d_ScaleDownKernel[2];
+  const int xl = sycl::min((int)(width - 1), sycl::max(0, (int)(xp - 2))); // source column: shifted by 2 and clamped to the image
+  const int yl = sycl::min((int)(height - 1), sycl::max(0, (int)(yp - 2))); // source row: same treatment
+  if (xp < (width + 4) && yp < (height + 4))
+  {
+    float v = d_Data[yl * pitch + xl]; // row pass below combines v with ShiftDown(v, 1..4); ShiftDown comes from cudautils.h -- NOTE(review): presumably a sub-group shuffle, confirm
+    brows[BW * ty + tx] =
+        k0 * (v + ShiftDown(v, 4, item_ct1)) +
+        k1 * (ShiftDown(v, 1, item_ct1) + ShiftDown(v, 3, item_ct1)) +
+        k2 * ShiftDown(v, 2, item_ct1);
+  }
+  // The barrier orders the brows writes above before the column pass reads them.
+  item_ct1.barrier();
+  const int xs = item_ct1.get_group(2) * W2 + tx;
+  const int ys = item_ct1.get_group(1) * H2 + ty;
+  if (tx < W2 && ty < H2 && xs < (width / 2) && ys < (height / 2))
+  {
+    float *ptr = &brows[BW * (ty * 2) + (tx * 2)]; // column pass: five consecutive brows rows at column 2 * tx, starting at row 2 * ty
+    d_Result[ys * newpitch + xs] = k0 * (ptr[0] + ptr[4 * BW]) + k1 * (ptr[1 * BW] + ptr[3 * BW]) + k2 * ptr[2 * BW];
+  }
+}
+
+void ScaleDownDense(float *d_Result, float *d_Data, int width, int pitch, int height, int newpitch,
+                    sycl::nd_item<3> item_ct1, float *d_ScaleDownKernel,
+                    float *irows, float *brows)
+{ // Halves an image with a symmetric 5-tap filter: stage a tile in irows, filter rows into brows, then filter columns.
+#define BW (SCALEDOWN_W + 4)
+#define BH (SCALEDOWN_H + 4)
+#define W2 (SCALEDOWN_W / 2)
+#define H2 (SCALEDOWN_H / 2)
+  // NOTE(review): these macros repeat the definitions in ScaleDownDenseShift and are not #undef'd; BH is unused here.
+  const int tx = item_ct1.get_local_id(2);
+  const int ty = item_ct1.get_local_id(1);
+  const int xp = item_ct1.get_group(2) * SCALEDOWN_W + tx;
+  const int yp = item_ct1.get_group(1) * SCALEDOWN_H + ty;
+  const int xl = sycl::min((int)(width - 1), sycl::max(0, (int)(xp - 2))); // source column: shifted by 2 and clamped to the image
+  const int yl = sycl::min((int)(height - 1), sycl::max(0, (int)(yp - 2))); // source row: same treatment
+  const float k0 = d_ScaleDownKernel[0];
+  const float k1 = d_ScaleDownKernel[1];
+  const float k2 = d_ScaleDownKernel[2];
+  if (xp < (width + 4) && yp < (height + 4))
+    irows[BW * ty + tx] = d_Data[yl * pitch + xl];
+  // Stage 1 done: the barrier orders the irows writes before the row pass.
+  item_ct1.barrier();
+  if (yp < (height + 4) && tx < W2)
+  {
+    float *ptr = &irows[BW * ty + 2 * tx]; // row pass: five adjacent inputs starting at column 2 * tx
+    brows[W2 * ty + tx] = k0 * (ptr[0] + ptr[4]) + k1 * (ptr[1] + ptr[3]) + k2 * ptr[2];
+  }
+  // Stage 2 done: the barrier orders the brows writes before the column pass.
+  item_ct1.barrier();
+  const int xs = item_ct1.get_group(2) * W2 + tx;
+  const int ys = item_ct1.get_group(1) * H2 + ty;
+  if (tx < W2 && ty < H2 && xs < (width / 2) && ys < (height / 2))
+  {
+    float *ptr = &brows[W2 * (ty * 2) + tx]; // column pass: five consecutive brows rows starting at row 2 * ty
+    d_Result[ys * newpitch + xs] = k0 * (ptr[0] + ptr[4 * W2]) + k1 * (ptr[1 * W2] + ptr[3 * W2]) + k2 * ptr[2 * W2];
+  }
+}
+
+void ScaleDown(float *d_Result, float *d_Data, int width, int pitch, int height, int newpitch,
+               sycl::nd_item<3> item_ct1, float *d_ScaleDownKernel, float *inrow,
+               float *brow, int *yRead, int *yWrite)
+{
+  // Streams SCALEDOWN_H + 4 source rows through inrow; brow keeps five row-filtered lines in slots tx0..tx4 that are combined column-wise.
+#define dx2 (SCALEDOWN_W / 2)
+  const int tx = item_ct1.get_local_id(2);
+  const int tx0 = tx + 0 * dx2;
+  const int tx1 = tx + 1 * dx2;
+  const int tx2 = tx + 2 * dx2;
+  const int tx3 = tx + 3 * dx2;
+  const int tx4 = tx + 4 * dx2;
+  const int xStart = item_ct1.get_group(2) * SCALEDOWN_W;
+  const int yStart = item_ct1.get_group(1) * SCALEDOWN_H;
+  const int xWrite = xStart / 2 + tx;
+  float k0 = d_ScaleDownKernel[0];
+  float k1 = d_ScaleDownKernel[1];
+  float k2 = d_ScaleDownKernel[2];
+  if (tx < SCALEDOWN_H + 4) // the first SCALEDOWN_H + 4 work-items precompute the per-row read/write offsets
+  {
+    int y = yStart + tx - 2;
+    y = (y < 0 ? 0 : y);
+    y = (y >= height ? height - 1 : y);
+    yRead[tx] = y * pitch; // offset of the clamped source row
+    yWrite[tx] = (yStart + tx - 4) / 2 * newpitch; // offset of the matching output row
+  }
+
+  // item_ct1.barrier();
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+  int xRead = xStart + tx - 2; // source column: shifted by 2 and clamped to the image below
+  xRead = (xRead < 0 ? 0 : xRead);
+  xRead = (xRead >= width ? width - 1 : xRead);
+
+  int maxtx = sycl::min(dx2, (int)(width / 2 - xStart / 2)); // number of output columns this group actually produces
+  for (int dy = 0; dy < SCALEDOWN_H + 4; dy += 5) // five rows per iteration, each written to a different brow slot
+  {
+    {
+      inrow[tx] = d_Data[yRead[dy + 0] + xRead];
+
+      // item_ct1.barrier();
+      item_ct1.barrier(sycl::access::fence_space::local_space);
+      if (tx < maxtx)
+      {
+        brow[tx4] = k0 * (inrow[2 * tx] + inrow[2 * tx + 4]) + k1 * (inrow[2 * tx + 1] + inrow[2 * tx + 3]) + k2 * inrow[2 * tx + 2];
+        if (dy >= 4 && !(dy & 1))
+          d_Result[yWrite[dy + 0] + xWrite] = k2 * brow[tx2] + k0 * (brow[tx0] + brow[tx4]) + k1 * (brow[tx1] + brow[tx3]);
+      }
+
+      // item_ct1.barrier();
+      item_ct1.barrier(sycl::access::fence_space::local_space);
+    }
+    if (dy < (SCALEDOWN_H + 3))
+    {
+      inrow[tx] = d_Data[yRead[dy + 1] + xRead];
+
+      item_ct1.barrier();
+      if (tx < maxtx)
+      {
+        brow[tx0] = k0 * (inrow[2 * tx] + inrow[2 * tx + 4]) + k1 * (inrow[2 * tx + 1] + inrow[2 * tx + 3]) + k2 * inrow[2 * tx + 2];
+        if (dy >= 3 && (dy & 1))
+          d_Result[yWrite[dy + 1] + xWrite] = k2 * brow[tx3] + k0 * (brow[tx1] + brow[tx0]) + k1 * (brow[tx2] + brow[tx4]);
+      }
+
+      // item_ct1.barrier();
+      item_ct1.barrier(sycl::access::fence_space::local_space);
+    }
+    if (dy < (SCALEDOWN_H + 2))
+    {
+      inrow[tx] = d_Data[yRead[dy + 2] + xRead];
+
+      // item_ct1.barrier();
+      item_ct1.barrier(sycl::access::fence_space::local_space);
+      if (tx < maxtx)
+      {
+        brow[tx1] = k0 * (inrow[2 * tx] + inrow[2 * tx + 4]) + k1 * (inrow[2 * tx + 1] + inrow[2 * tx + 3]) + k2 * inrow[2 * tx + 2];
+        if (dy >= 2 && !(dy & 1))
+          d_Result[yWrite[dy + 2] + xWrite] = k2 * brow[tx4] + k0 * (brow[tx2] + brow[tx1]) + k1 * (brow[tx3] + brow[tx0]);
+      }
+
+      // item_ct1.barrier();
+      item_ct1.barrier(sycl::access::fence_space::local_space);
+    }
+    if (dy < (SCALEDOWN_H + 1))
+    {
+      inrow[tx] = d_Data[yRead[dy + 3] + xRead];
+
+      item_ct1.barrier();
+      if (tx < maxtx)
+      {
+        brow[tx2] = k0 * (inrow[2 * tx] + inrow[2 * tx + 4]) + k1 * (inrow[2 * tx + 1] + inrow[2 * tx + 3]) + k2 * inrow[2 * tx + 2];
+        if (dy >= 1 && (dy & 1))
+          d_Result[yWrite[dy + 3] + xWrite] = k2 * brow[tx0] + k0 * (brow[tx3] + brow[tx2]) + k1 * (brow[tx4] + brow[tx1]);
+      }
+
+      // item_ct1.barrier();
+      item_ct1.barrier(sycl::access::fence_space::local_space);
+    }
+    if (dy < SCALEDOWN_H)
+    {
+      inrow[tx] = d_Data[yRead[dy + 4] + xRead];
+
+      item_ct1.barrier();
+      if (tx < dx2 && xWrite < width / 2) // same column guard as tx < maxtx, written out
+      {
+        brow[tx3] = k0 * (inrow[2 * tx] + inrow[2 * tx + 4]) + k1 * (inrow[2 * tx + 1] + inrow[2 * tx + 3]) + k2 * inrow[2 * tx + 2];
+        if (!(dy & 1))
+          d_Result[yWrite[dy + 4] + xWrite] = k2 * brow[tx1] + k0 * (brow[tx4] + brow[tx3]) + k1 * (brow[tx0] + brow[tx2]);
+      }
+
+      // item_ct1.barrier();
+      item_ct1.barrier(sycl::access::fence_space::local_space);
+    }
+  }
+}
+
+void ScaleUp(float *d_Result, float *d_Data, int width, int pitch, int height, int newpitch,
+             sycl::nd_item<3> item_ct1)
+{ // 2x upsampling: each work-item expands one source pixel into a 2x2 output quad.
+  const int lx = item_ct1.get_local_id(2);
+  const int ly = item_ct1.get_local_id(1);
+  const int outX = item_ct1.get_group(2) * SCALEUP_W + 2 * lx;
+  const int outY = item_ct1.get_group(1) * SCALEUP_H + 2 * ly;
+  if (outX >= 2 * width || outY >= 2 * height)
+    return; // this quad lies outside the upsampled image
+  const int srcX = item_ct1.get_group(2) * (SCALEUP_W / 2) + lx;
+  const int srcY = item_ct1.get_group(1) * (SCALEUP_H / 2) + ly;
+  const int srcXr = sycl::min((int)(srcX + 1), (int)(width - 1)); // right neighbour, clamped to the last column
+  const int srcYd = sycl::min((int)(srcY + 1), (int)(height - 1)); // lower neighbour, clamped to the last row
+  const float *rowU = d_Data + srcY * pitch, *rowD = d_Data + srcYd * pitch;
+  float *out0 = d_Result + (outY + 0) * newpitch + outX;
+  float *out1 = d_Result + (outY + 1) * newpitch + outX;
+  const float vul = rowU[srcX], vur = rowU[srcXr];
+  const float vdl = rowD[srcX], vdr = rowD[srcXr];
+  out0[0] = vul;                                // source pixel copied as is
+  out0[1] = 0.50f * (vul + vur);                // midpoint to the right
+  out1[0] = 0.50f * (vul + vdl);                // midpoint below
+  out1[1] = 0.25f * (vul + vur + vdl + vdr);    // centre of the four neighbours
+}
+
+float FastAtan2(float y, float x)
+{ // Polynomial approximation of atan2(y, x) in radians; returns 0 for (0, 0) instead of NaN.
+  const float absx = sycl::fabs(x);
+  const float absy = sycl::fabs(y);
+  const float hi = sycl::max(absx, absy); // zero only when both inputs are zero
+  float a = (hi > 0.0f ? sycl::min(absx, absy) / hi : 0.0f); // guard: 0/0 would make every later step NaN
+  float s = a * a;
+  float r = ((-0.0464964749f * s + 0.15931422f) * s - 0.327622764f) * s * a + a; // atan(a) for a in [0, 1]
+  r = (absy > absx ? 1.57079637f - r : r); // octant fold: pi/2 - r when |y| > |x|
+  r = (x < 0 ? 3.14159274f - r : r);       // left half-plane: pi - r
+  r = (y < 0 ? -r : r);                    // lower half-plane: negate
+  return r;
+}
+
+void ExtractSiftDescriptorsCONSTNew(
+    // Builds the 128-value descriptor for the points of `octave`; every work-group strides over the point range.
+    float *texObj, int pitch, SiftPoint *d_sift,
+    float subsampling, int octave, sycl::nd_item<3> item_ct1,
+    int d_MaxNumPoints, unsigned int *d_PointCounter, float *gauss,
+    float *buffer, float *sums)
+{
+  // Uses idx = ty * 16 + tx in 0..127 and one sums[] slot per 32 work-items -- NOTE(review): assumes ShiftDown reduces over 32 lanes, confirm the launch sub-group size.
+  const int tx = item_ct1.get_local_id(2); // 0 -> 16
+  const int ty = item_ct1.get_local_id(1); // 0 -> 8
+  const int idx = ty * 16 + tx;
+  if (ty == 0)
+    gauss[tx] = sycl::exp(-(tx - 7.5f) * (tx - 7.5f) / 128.0f);
+
+  int fstPts =
+      sycl::min(d_PointCounter[2 * octave - 1], (unsigned int)d_MaxNumPoints);
+  int totPts =
+      sycl::min(d_PointCounter[2 * octave + 1], (unsigned int)d_MaxNumPoints);
+
+  for (int bx = item_ct1.get_group(2) + fstPts; bx < totPts;
+       bx += item_ct1.get_group_range(2))
+  {
+
+    buffer[idx] = 0.0f;
+
+    // item_ct1.barrier();
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+
+    // Compute angles and gradients
+    float theta = 2.0f * 3.1415f / 360.0f * d_sift[bx].orientation;
+    float sina = sycl::sin(theta); // cosa -sina
+    float cosa = sycl::cos(theta); // sina  cosa
+    float scale = 12.0f / 16.0f * d_sift[bx].scale;
+    float ssina = scale * sina;
+    float scosa = scale * cosa;
+
+    for (int y = ty; y < 16; y += 8)
+    {
+      float xpos = d_sift[bx].xpos + (tx - 7.5f) * scosa - (y - 7.5f) * ssina + 0.5f;
+      float ypos = d_sift[bx].ypos + (tx - 7.5f) * ssina + (y - 7.5f) * scosa + 0.5f;
+
+      int xi1 = xpos + cosa;
+      int yi1 = ypos + sina;
+
+      int xi2 = xpos - cosa;
+      int yi2 = ypos - sina;
+
+      float dx = *(texObj + yi1 * pitch + xi1) -
+                 *(texObj + yi2 * pitch + xi2);
+
+      xi1 = xpos - sina;
+      yi1 = ypos + cosa;
+
+      xi2 = xpos + sina;
+      yi2 = ypos - cosa;
+
+      float dy = *(texObj + yi1 * pitch + xi1) -
+                 *(texObj + yi2 * pitch + xi2);
+      float grad = gauss[y] * gauss[tx] * sycl::sqrt(dx * dx + dy * dy);
+      float angf = 4.0f / 3.1415f * FastAtan2(dy, dx) + 4.0f;
+
+      int hori = (tx + 2) / 4 - 1; // Convert from (tx,y,angle) to bins
+      float horf = (tx - 1.5f) / 4.0f - hori;
+      float ihorf = 1.0f - horf;
+      int veri = (y + 2) / 4 - 1;
+      float verf = (y - 1.5f) / 4.0f - veri;
+      float iverf = 1.0f - verf;
+      int angi = angf;
+      angf -= angi;
+      angi &= 7; // an angle of +pi yields angf just above 8.0: wrap bin 8 to bin 0 so p1 stays inside this histogram (and inside buffer[128])
+      int angp = (angi < 7 ? angi + 1 : 0);
+      float iangf = 1.0f - angf;
+      int hist = 8 * (4 * veri + hori); // Each gradient measure is interpolated
+      int p1 = angi + hist;             // in angles, xpos and ypos -> 8 stores
+      int p2 = angp + hist;
+      if (tx >= 2)
+      {
+        float grad1 = ihorf * grad;
+        if (y >= 2)
+        { // Upper left
+          float grad2 = iverf * grad1;
+          infra::atomic_fetch_add<sycl::access::address_space::local_space>(
+              buffer + p1, iangf * grad2);
+          infra::atomic_fetch_add<sycl::access::address_space::local_space>(
+              buffer + p2, angf * grad2);
+        }
+        if (y <= 13)
+        { // Lower left
+          float grad2 = verf * grad1;
+          infra::atomic_fetch_add<sycl::access::address_space::local_space>(
+              buffer + p1 + 32, iangf * grad2);
+          infra::atomic_fetch_add<sycl::access::address_space::local_space>(
+              buffer + p2 + 32, angf * grad2);
+        }
+      }
+      if (tx <= 13)
+      {
+        float grad1 = horf * grad;
+        if (y >= 2)
+        { // Upper right
+          float grad2 = iverf * grad1;
+          infra::atomic_fetch_add<sycl::access::address_space::local_space>(
+              buffer + p1 + 8, iangf * grad2);
+          infra::atomic_fetch_add<sycl::access::address_space::local_space>(
+              buffer + p2 + 8, angf * grad2);
+        }
+        if (y <= 13)
+        { // Lower right
+          float grad2 = verf * grad1;
+          infra::atomic_fetch_add<sycl::access::address_space::local_space>(
+              buffer + p1 + 40, iangf * grad2);
+          infra::atomic_fetch_add<sycl::access::address_space::local_space>(
+              buffer + p2 + 40, angf * grad2);
+        }
+      }
+    }
+    // item_ct1.barrier();
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+
+    // Normalize twice and suppress peaks first time
+    float sum = buffer[idx] * buffer[idx];
+    for (int i = 16; i > 0; i /= 2)
+      sum += ShiftDown(sum, i, item_ct1);
+    if ((idx & 31) == 0)
+      sums[idx / 32] = sum;
+    item_ct1.barrier();
+    float tsum1 = sums[0] + sums[1] + sums[2] + sums[3];
+    tsum1 = sycl::min((float)(buffer[idx] * sycl::rsqrt(tsum1)), 0.2f);
+    item_ct1.barrier(sycl::access::fence_space::local_space); // every work-item must have read sums[] for tsum1 before the second pass overwrites it
+    sum = tsum1 * tsum1;
+    for (int i = 16; i > 0; i /= 2)
+      sum += ShiftDown(sum, i, item_ct1);
+    if ((idx & 31) == 0)
+      sums[idx / 32] = sum;
+    // item_ct1.barrier();
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+
+    float tsum2 = sums[0] + sums[1] + sums[2] + sums[3];
+    float *desc = d_sift[bx].data;
+    desc[idx] = tsum1 * sycl::rsqrt(tsum2);
+    if (idx == 0)
+    {
+      d_sift[bx].xpos *= subsampling;
+      d_sift[bx].ypos *= subsampling;
+      d_sift[bx].scale *= subsampling;
+    }
+    // item_ct1.barrier();
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+  }
+}
+
+void ExtractSiftDescriptor(rawImg_data texObj,
+                           SiftPoint *d_sift, float subsampling, int octave,
+                           int bx, sycl::nd_item<3> item_ct1, float *gauss,
+                           float *buffer, float *sums)
+{
+  // Builds the 128-value descriptor of point bx; idx in 0..127 selects the output slot -- NOTE(review): sums[] assumes ShiftDown reduces over 32 lanes, confirm.
+  const int idx = item_ct1.get_local_id(2);
+  const int tx = idx & 15; // 0 -> 16
+  const int ty = idx / 16; // 0 -> 8
+  if (ty == 0)
+    gauss[tx] = sycl::exp(-(tx - 7.5f) * (tx - 7.5f) / 128.0f);
+  buffer[idx] = 0.0f;
+  item_ct1.barrier();
+
+  // Compute angles and gradients
+  float theta = 2.0f * 3.1415f / 360.0f * d_sift[bx].orientation;
+  float sina = sycl::sin(theta); // cosa -sina
+  float cosa = sycl::cos(theta); // sina  cosa
+  float scale = 12.0f / 16.0f * d_sift[bx].scale;
+  float ssina = scale * sina;
+  float scosa = scale * cosa;
+
+  for (int y = ty; y < 16; y += 8)
+  {
+    float xpos = d_sift[bx].xpos + (tx - 7.5f) * scosa - (y - 7.5f) * ssina + 0.5f;
+    float ypos = d_sift[bx].ypos + (tx - 7.5f) * ssina + (y - 7.5f) * scosa + 0.5f;
+    float dx = texObj.read(xpos + cosa, ypos + sina) -
+               texObj.read(xpos - cosa, ypos - sina);
+    float dy = texObj.read(xpos - sina, ypos + cosa) -
+               texObj.read(xpos + sina, ypos - cosa);
+    float grad = gauss[y] * gauss[tx] * sycl::sqrt(dx * dx + dy * dy);
+    float angf = 4.0f / 3.1415f * sycl::atan2(dy, dx) + 4.0f;
+
+    int hori = (tx + 2) / 4 - 1; // Convert from (tx,y,angle) to bins
+    float horf = (tx - 1.5f) / 4.0f - hori;
+    float ihorf = 1.0f - horf;
+    int veri = (y + 2) / 4 - 1;
+    float verf = (y - 1.5f) / 4.0f - veri;
+    float iverf = 1.0f - verf;
+    int angi = angf;
+    angf -= angi;
+    angi &= 7; // an angle of +pi yields angf just above 8.0: wrap bin 8 to bin 0 so p1 stays inside this histogram (and inside buffer[128])
+    int angp = (angi < 7 ? angi + 1 : 0);
+    float iangf = 1.0f - angf;
+    int hist = 8 * (4 * veri + hori); // Each gradient measure is interpolated
+    int p1 = angi + hist;             // in angles, xpos and ypos -> 8 stores
+    int p2 = angp + hist;
+    if (tx >= 2)
+    {
+      float grad1 = ihorf * grad;
+      if (y >= 2)
+      { // Upper left
+        float grad2 = iverf * grad1;
+        infra::atomic_fetch_add<sycl::access::address_space::local_space>(
+            buffer + p1, iangf * grad2);
+        infra::atomic_fetch_add<sycl::access::address_space::local_space>(
+            buffer + p2, angf * grad2);
+      }
+      if (y <= 13)
+      { // Lower left
+        float grad2 = verf * grad1;
+        infra::atomic_fetch_add<sycl::access::address_space::local_space>(
+            buffer + p1 + 32, iangf * grad2);
+        infra::atomic_fetch_add<sycl::access::address_space::local_space>(
+            buffer + p2 + 32, angf * grad2);
+      }
+    }
+    if (tx <= 13)
+    {
+      float grad1 = horf * grad;
+      if (y >= 2)
+      { // Upper right
+        float grad2 = iverf * grad1;
+        infra::atomic_fetch_add<sycl::access::address_space::local_space>(
+            buffer + p1 + 8, iangf * grad2);
+        infra::atomic_fetch_add<sycl::access::address_space::local_space>(
+            buffer + p2 + 8, angf * grad2);
+      }
+      if (y <= 13)
+      { // Lower right
+        float grad2 = verf * grad1;
+        infra::atomic_fetch_add<sycl::access::address_space::local_space>(
+            buffer + p1 + 40, iangf * grad2);
+        infra::atomic_fetch_add<sycl::access::address_space::local_space>(
+            buffer + p2 + 40, angf * grad2);
+      }
+    }
+  }
+
+  item_ct1.barrier();
+
+  // Normalize twice and suppress peaks first time
+  float sum = buffer[idx] * buffer[idx];
+  for (int i = 16; i > 0; i /= 2)
+    sum += ShiftDown(sum, i, item_ct1);
+  if ((idx & 31) == 0)
+    sums[idx / 32] = sum;
+
+  item_ct1.barrier();
+  float tsum1 = sums[0] + sums[1] + sums[2] + sums[3];
+  tsum1 = sycl::min((float)(buffer[idx] * sycl::rsqrt(tsum1)), 0.2f);
+  item_ct1.barrier(); // every work-item must have read sums[] for tsum1 before the second pass overwrites it
+  sum = tsum1 * tsum1;
+  for (int i = 16; i > 0; i /= 2)
+    sum += ShiftDown(sum, i, item_ct1);
+  if ((idx & 31) == 0)
+    sums[idx / 32] = sum;
+
+  item_ct1.barrier();
+
+  float tsum2 = sums[0] + sums[1] + sums[2] + sums[3];
+  float *desc = d_sift[bx].data;
+  desc[idx] = tsum1 * sycl::rsqrt(tsum2);
+  if (idx == 0)
+  {
+    d_sift[bx].xpos *= subsampling;
+    d_sift[bx].ypos *= subsampling;
+    d_sift[bx].scale *= subsampling;
+  }
+
+  item_ct1.barrier();
+}
+
+void RescalePositions(SiftPoint *d_sift, int numPts, float scale,
+                      sycl::nd_item<3> item_ct1)
+{ // One work-item per point: multiplies xpos, ypos and scale of that point by `scale`.
+  const int groupBase = item_ct1.get_group(2) * item_ct1.get_local_range().get(2);
+  const int pt = groupBase + item_ct1.get_local_id(2);
+  if (pt >= numPts)
+    return; // no point is assigned to this work-item
+  SiftPoint &p = d_sift[pt];
+  p.xpos *= scale;
+  p.ypos *= scale;
+  p.scale *= scale;
+}
+
+// With constant number of blocks: each work-group strides over the points of `octave` and assigns up to two orientations per point.
+void ComputeOrientationsCONSTNew(float *image, int w, int p, int h, SiftPoint *d_Sift, int octave,
+                                 sycl::nd_item<3> item_ct1, int d_MaxNumPoints,
+                                 unsigned int *d_PointCounter,
+                                 sycl::accessor<float, 2, sycl::access_mode::read_write, sycl::access::target::local> img,
+                                 sycl::accessor<float, 2, sycl::access_mode::read_write, sycl::access::target::local> tmp,
+                                 float *hist, float *gaussx, float *gaussy)
+{
+#define RAD 9
+#define WID (2 * RAD + 1)
+#define LEN 32 //%%%% Note: Lowe suggests 36, not 32
+  // hist[] has 2 * LEN entries: [LEN, 2 * LEN) collects the raw histogram, [0, LEN) the smoothed peaks. The group needs at least LEN work-items.
+  const int tx = item_ct1.get_local_id(2);
+  int fstPts = sycl::min(d_PointCounter[2 * octave - 1], (unsigned int)d_MaxNumPoints); // NOTE(review): index is -1 when octave == 0 -- confirm how callers offset the pointer
+  int totPts = sycl::min(d_PointCounter[2 * octave + 0], (unsigned int)d_MaxNumPoints);
+  for (int bx = item_ct1.get_group(2) + fstPts; bx < totPts; bx += item_ct1.get_group_range(2))
+  {
+    float sc = d_Sift[bx].scale;
+    for (int i = tx; i < 2 * LEN; i += item_ct1.get_local_range().get(2))
+      hist[i] = 0.0f;
+    float xp = d_Sift[bx].xpos;
+    float yp = d_Sift[bx].ypos;
+    int xi = (int)xp;
+    int yi = (int)yp;
+    float xf = xp - xi;
+    float yf = yp - yi;
+    for (int i = tx; i < WID * WID; i += item_ct1.get_local_range().get(2))
+    {
+      int y = i / WID;
+      int x = i - y * WID;
+      int xp = sycl::max(sycl::min((int)(x - RAD + xi), (int)(w - 1)), 0);
+      int yp = sycl::max(sycl::min((int)(y - RAD + yi), (int)(h - 1)), 0);
+      img[y][x] = image[yp * p + xp];
+    }
+    float fac[5];
+    fac[1] = fac[3] =
+        (sc > 0.5f ? sycl::exp(-1.0f / (2.0f * (sc * sc - 0.25f))) : 0.0f);
+    fac[0] = fac[4] =
+        (sc > 0.5f ? sycl::exp(-4.0f / (2.0f * (sc * sc - 0.25f))) : 0.0f);
+    fac[2] = 1.0f;
+    float i2sigma2 = -1.0f / (2.0f * 2.0f * 2.0f * sc * sc); //%%%% Note: Lowe suggests 1.5, not 2.0
+    if (tx < WID)
+    {
+      gaussx[tx] = sycl::exp(i2sigma2 * (tx - RAD - xf) * (tx - RAD - xf));
+      gaussy[tx] = sycl::exp(i2sigma2 * (tx - RAD - yf) * (tx - RAD - yf));
+    }
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+    for (int i = tx; i < (WID - 4) * WID;
+         i += item_ct1.get_local_range().get(2))
+    {
+      int y = i / WID;
+      int x = i - y * WID;
+      y += 2;
+      tmp[y][x] = img[y][x] + fac[1] * (img[y - 1][x] + img[y + 1][x]) +
+                  fac[0] * (img[y - 2][x] + img[y + 2][x]);
+    }
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+    for (int i = tx; i < (WID - 4) * (WID - 4);
+         i += item_ct1.get_local_range().get(2))
+    {
+      int y = i / (WID - 4);
+      int x = i - y * (WID - 4);
+      x += 2;
+      y += 2;
+      img[y][x] = tmp[y][x] + fac[1] * (tmp[y][x - 1] + tmp[y][x + 1]) +
+                  fac[0] * (tmp[y][x - 2] + tmp[y][x + 2]);
+    }
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+    for (int i = tx; i < (WID - 6) * (WID - 6);
+         i += item_ct1.get_local_range().get(2))
+    {
+      int y = i / (WID - 6);
+      int x = i - y * (WID - 6);
+      x += 3;
+      y += 3;
+      float dx = img[y][x + 1] - img[y][x - 1];
+      float dy = img[y + 1][x] - img[y - 1][x];
+      int bin =
+          (int)((LEN / 2) * sycl::atan2(dy, dx) / 3.1416f + (LEN / 2) + 0.5f) %
+          LEN;
+      float grad = sycl::sqrt(dx * dx + dy * dy);
+      infra::atomic_fetch_add<sycl::access::address_space::local_space>(
+          &hist[LEN + bin], grad * gaussx[x] * gaussy[y]);
+    }
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+    int x1m = (tx >= 1 ? tx - 1 : tx + LEN - 1);
+    int x1p = (tx < (LEN - 1) ? tx + 1 : tx - LEN + 1);
+    int x2m = (tx >= 2 ? tx - 2 : tx + LEN - 2);
+    int x2p = (tx < (LEN - 2) ? tx + 2 : tx - LEN + 2);
+    if (tx < LEN) // pass 1: circular 1-4-6-4-1 smoothing of the raw histogram into hist[0, LEN)
+      hist[tx] = 6.0f * hist[tx + LEN] + 4.0f * (hist[x1m + LEN] + hist[x1p + LEN]) +
+                 1.0f * (hist[x2m + LEN] + hist[x2p + LEN]);
+    item_ct1.barrier(sycl::access::fence_space::local_space); // pass 2 reads neighbours' pass-1 results; lock-step execution of LEN work-items is not guaranteed
+    if (tx < LEN) // pass 2: second smoothing, written back to hist[LEN, 2 * LEN)
+      hist[tx + LEN] = 8.0f * hist[tx] + 4.0f * (hist[x1m] + hist[x1p]) +
+                       0.0f * (hist[x2m] + hist[x2p]);
+    item_ct1.barrier(sycl::access::fence_space::local_space); // pass 3 reads neighbours' pass-2 results and overwrites hist[0, LEN)
+    if (tx < LEN)
+    { // pass 3: keep only local maxima in hist[0, LEN)
+      float val = hist[tx + LEN];
+      hist[tx] = (val > hist[x1m + LEN] && val >= hist[x1p + LEN] ? val : 0.0f);
+    }
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+    if (tx == 0)
+    {
+      float maxval1 = 0.0;
+      float maxval2 = 0.0;
+      int i1 = -1;
+      int i2 = -1;
+      for (int i = 0; i < LEN; i++)
+      {
+        float v = hist[i];
+        if (v > maxval1)
+        {
+          maxval2 = maxval1;
+          maxval1 = v;
+          i2 = i1;
+          i1 = i;
+        }
+        else if (v > maxval2)
+        {
+          maxval2 = v;
+          i2 = i;
+        }
+      }
+      float val1 = hist[LEN + ((i1 + 1) % LEN)];
+      float val2 = hist[LEN + ((i1 + LEN - 1) % LEN)];
+      float peak = i1 + 0.5f * (val1 - val2) / (2.0f * maxval1 - val1 - val2);
+      d_Sift[bx].orientation = 360.0f * (peak < 0.0f ? peak + LEN : peak) / LEN;
+      sycl::atomic<unsigned int>(
+          sycl::global_ptr<unsigned int>(&d_PointCounter[2 * octave + 1]))
+          .fetch_max(d_PointCounter[2 * octave + 0]);
+      if (maxval2 > 0.8f * maxval1 && true)
+      {
+        float val1 = hist[LEN + ((i2 + 1) % LEN)];
+        float val2 = hist[LEN + ((i2 + LEN - 1) % LEN)];
+        float peak = i2 + 0.5f * (val1 - val2) / (2.0f * maxval2 - val1 - val2);
+        unsigned int idx = infra::atomic_fetch_compare_inc(
+            &d_PointCounter[2 * octave + 1], (unsigned int)0x7fffffff);
+        if (idx < d_MaxNumPoints)
+        {
+          d_Sift[idx].xpos = d_Sift[bx].xpos;
+          d_Sift[idx].ypos = d_Sift[bx].ypos;
+          d_Sift[idx].scale = sc;
+          d_Sift[idx].sharpness = d_Sift[bx].sharpness;
+          d_Sift[idx].edgeness = d_Sift[bx].edgeness;
+          d_Sift[idx].orientation = 360.0f * (peak < 0.0f ? peak + LEN : peak) / LEN;
+          d_Sift[idx].subsampling = d_Sift[bx].subsampling;
+        }
+      }
+    }
+    item_ct1.barrier(sycl::access::fence_space::local_space); // work-item 0 must finish scanning hist[] before the next point's pass clears it
+  }
+#undef RAD
+#undef WID
+#undef LEN
+}
+
+// Orientation assignment with a constant number of work-groups: each group strides over this octave's keypoints.
+void ComputeOrientationsCONST(rawImg_data texObj,
+                              SiftPoint *d_Sift, int octave,
+                              sycl::nd_item<3> item_ct1, int d_MaxNumPoints,
+                              unsigned int *d_PointCounter, float *hist,
+                              float *gauss)
+{
+  // hist (>= 64 floats) and gauss (>= 11 floats) are scratch buffers shared by the work-items of one group.
+  const int tx = item_ct1.get_local_id(2);
+
+  int fstPts =
+      sycl::min(d_PointCounter[2 * octave - 1], (unsigned int)d_MaxNumPoints);
+  int totPts =
+      sycl::min(d_PointCounter[2 * octave + 0], (unsigned int)d_MaxNumPoints);
+  for (int bx = item_ct1.get_group(2) + fstPts; bx < totPts;
+       bx += item_ct1.get_group_range(2))
+  {
+    // Gaussian weights with sigma = 1.5 * scale for offsets -5..5, and a cleared 64-entry histogram.
+    float i2sigma2 = -1.0f / (2.0f * 1.5f * 1.5f * d_Sift[bx].scale * d_Sift[bx].scale);
+    if (tx < 11)
+      gauss[tx] = sycl::exp(i2sigma2 * (tx - 5) * (tx - 5));
+    if (tx < 64)
+      hist[tx] = 0.0f;
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+    float xp = d_Sift[bx].xpos - 4.5f;
+    float yp = d_Sift[bx].ypos - 4.5f;
+    int yd = tx / 11; // work-items 0..120 cover an 11x11 window around the keypoint
+    int xd = tx - yd * 11;
+    float xf = xp + xd;
+    float yf = yp + yd;
+    if (yd < 11)
+    {
+      float dx = texObj.read(xf + 1.0, yf) - texObj.read(xf - 1.0, yf); // src_d_data[yf * pitch + xf]
+      float dy = texObj.read(xf, yf + 1.0) - texObj.read(xf, yf - 1.0);
+      int bin = 16.0f * sycl::atan2(dy, dx) / 3.1416f + 16.5f; // 32 angular bins
+      if (bin > 31)
+        bin = 0; // wrap the top edge of the angle range back to bin 0
+      float grad = sycl::sqrt(dx * dx + dy * dy);
+      infra::atomic_fetch_add<sycl::access::address_space::local_space>(
+          &hist[bin], grad * gauss[xd] * gauss[yd]);
+    }
+    // After all votes are in, smooth the 32 bins circularly with weights 1-4-6-4-1 into hist[32..63].
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+    int x1m = (tx >= 1 ? tx - 1 : tx + 31);
+    int x1p = (tx <= 30 ? tx + 1 : tx - 31);
+    if (tx < 32)
+    {
+      int x2m = (tx >= 2 ? tx - 2 : tx + 30);
+      int x2p = (tx <= 29 ? tx + 2 : tx - 30);
+      hist[tx + 32] = 6.0f * hist[tx] + 4.0f * (hist[x1m] + hist[x1p]) + (hist[x2m] + hist[x2p]);
+    }
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+    if (tx < 32)
+    {
+      float v = hist[32 + tx];
+      hist[tx] = (v > hist[32 + x1m] && v >= hist[32 + x1p] ? v : 0.0f);
+    }
+    // hist[0..31] now holds only local maxima of the smoothed histogram; work-item 0 picks the two largest.
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+    if (tx == 0)
+    {
+      float maxval1 = 0.0;
+      float maxval2 = 0.0;
+      int i1 = -1;
+      int i2 = -1;
+      for (int i = 0; i < 32; i++)
+      {
+        float v = hist[i];
+        if (v > maxval1)
+        {
+          maxval2 = maxval1;
+          maxval1 = v;
+          i2 = i1;
+          i1 = i;
+        }
+        else if (v > maxval2)
+        {
+          maxval2 = v;
+          i2 = i;
+        }
+      }
+      float val1 = hist[32 + ((i1 + 1) & 31)];
+      float val2 = hist[32 + ((i1 + 31) & 31)];
+      float peak = i1 + 0.5f * (val1 - val2) / (2.0f * maxval1 - val1 - val2); // parabola vertex through the peak and its neighbours
+      d_Sift[bx].orientation = 11.25f * (peak < 0.0f ? peak + 32.0f : peak); // 32 bins * 11.25 = 360
+      sycl::atomic<unsigned int>(
+          sycl::global_ptr<unsigned int>(&d_PointCounter[2 * octave + 1]))
+          .fetch_max(d_PointCounter[2 * octave + 0]);
+      if (maxval2 > 0.8f * maxval1)
+      {
+        float val1 = hist[32 + ((i2 + 1) & 31)];
+        float val2 = hist[32 + ((i2 + 31) & 31)];
+        float peak = i2 + 0.5f * (val1 - val2) / (2.0f * maxval2 - val1 - val2);
+        unsigned int idx = infra::atomic_fetch_compare_inc(
+            &d_PointCounter[2 * octave + 1], (unsigned int)0x7fffffff);
+        if (idx < d_MaxNumPoints)
+        {
+          // Second peak above 80% of the first: clone the keypoint, only the orientation differs.
+          d_Sift[idx].xpos = d_Sift[bx].xpos;
+          d_Sift[idx].ypos = d_Sift[bx].ypos;
+          d_Sift[idx].scale = d_Sift[bx].scale;
+          d_Sift[idx].sharpness = d_Sift[bx].sharpness;
+          d_Sift[idx].edgeness = d_Sift[bx].edgeness;
+          d_Sift[idx].orientation = 11.25f * (peak < 0.0f ? peak + 32.0f : peak);
+          d_Sift[idx].subsampling = d_Sift[bx].subsampling;
+        }
+      }
+    }
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+  }
+}
+// Finds local extrema in three consecutive planes of d_Data0, refines them and appends the accepted keypoints to d_Sift.
+void FindPointsMultiNew(float *d_Data0, SiftPoint *d_Sift, int width, int pitch, int height, float subsampling, float lowestScale, float thresh, float factor, float edgeLimit, int octave,
+                        sycl::nd_item<3> item_ct1, int d_MaxNumPoints,
+                        unsigned int *d_PointCounter, unsigned short *points)
+{
+#define MEMWID (MINMAX_W + 2)
+  // One work-item of the whole launch lifts both counters of this octave to at least d_PointCounter[2 * octave - 1].
+  if (item_ct1.get_group(2) == 0 && item_ct1.get_group(1) == 0 &&
+      item_ct1.get_local_id(2) == 0)
+  {
+    sycl::atomic<unsigned int>(
+        sycl::global_ptr<unsigned int>(&d_PointCounter[2 * octave + 0]))
+        .fetch_max(d_PointCounter[2 * octave - 1]);
+    sycl::atomic<unsigned int>(
+        sycl::global_ptr<unsigned int>(&d_PointCounter[2 * octave + 1]))
+        .fetch_max(d_PointCounter[2 * octave - 1]);
+  }
+  int tx = item_ct1.get_local_id(2);
+  int block = item_ct1.get_group(2) / NUM_SCALES; // group index 2 encodes (column block, scale)
+  int scale = item_ct1.get_group(2) - NUM_SCALES * block;
+  int minx = block * MINMAX_W;
+  int maxx = sycl::min((int)(minx + MINMAX_W), width);
+  int xpos = minx + tx;
+  int size = pitch * height; // floats per scale plane
+  int ptr =
+      size * scale + sycl::max(sycl::min((int)(xpos - 1), (int)(width - 1)), 0);
+  // Cheap pre-test: largest magnitude of this column in the middle plane over the rows handled by this group.
+  int yloops =
+      sycl::min((unsigned int)(height - MINMAX_H * item_ct1.get_group(1)),
+                (unsigned int)(MINMAX_H));
+  float maxv = 0.0f;
+  for (int y = 0; y < yloops; y++)
+  {
+    int ypos = MINMAX_H * item_ct1.get_group(1) + y;
+    int yptr1 = ptr + ypos * pitch;
+    float val = d_Data0[yptr1 + 1 * size];
+    maxv = sycl::fmax(maxv, sycl::fabs(val));
+  }
+  // Leave as a whole sub-group when no lane exceeds thresh (the lane-bit term is non-zero for lane ids below 32).
+  if (!sycl::any_of_group(
+          item_ct1.get_sub_group(),
+          (0xffffffff &
+           (0x1 << item_ct1.get_sub_group().get_local_linear_id())) &&
+              maxv > thresh))
+    return;
+  // Bit y of ptbits is set when row y of this column is a strict extremum beyond +/-thresh.
+  // points: scratch list of up to MEMWID (x, y) pairs shared by the work-items.
+  int ptbits = 0;
+  for (int y = 0; y < yloops; y++)
+  {
+
+    int ypos = MINMAX_H * item_ct1.get_group(1) + y;
+    int yptr1 = ptr + ypos * pitch;
+    float d11 = d_Data0[yptr1 + 1 * size];
+    if (sycl::any_of_group(
+            item_ct1.get_sub_group(),
+            (0xffffffff &
+             (0x1 << item_ct1.get_sub_group().get_local_linear_id())) &&
+                sycl::fabs(d11) > thresh))
+    {
+      // dPR = plane P (0..2), row R (0 = above, 1 = this, 2 = below) of this work-item's column.
+      int yptr0 = ptr + sycl::max(0, (int)(ypos - 1)) * pitch;
+      int yptr2 = ptr + sycl::min((int)(height - 1), (int)(ypos + 1)) * pitch;
+      float d01 = d_Data0[yptr1];
+      float d10 = d_Data0[yptr0 + 1 * size];
+      float d12 = d_Data0[yptr2 + 1 * size];
+      float d21 = d_Data0[yptr1 + 2 * size];
+
+      float d00 = d_Data0[yptr0];
+      float d02 = d_Data0[yptr2];
+      float ymin1 = sycl::fmin(sycl::fmin(d00, d01), d02);
+      float ymax1 = sycl::fmax(sycl::fmax(d00, d01), d02);
+      float d20 = d_Data0[yptr0 + 2 * size];
+      float d22 = d_Data0[yptr2 + 2 * size];
+      float ymin3 = sycl::fmin(sycl::fmin(d20, d21), d22);
+      float ymax3 = sycl::fmax(sycl::fmax(d20, d21), d22);
+      float ymin2 = sycl::fmin(
+          sycl::fmin(ymin1, sycl::fmin(sycl::fmin(d10, d12), d11)), ymin3);
+      float ymax2 = sycl::fmax(
+          sycl::fmax(ymax1, sycl::fmax(sycl::fmax(d10, d12), d11)), ymax3);
+
+      // Fetch the column-wise min/max of the two horizontally adjacent work-items
+      // (ShiftUp/ShiftDown are defined elsewhere; presumably sub-group shifts by one lane - confirm).
+
+      float nmin2 = sycl::fmin(ShiftUp(ymin2, 1, item_ct1), ShiftDown(ymin2, 1, item_ct1));
+      float nmax2 = sycl::fmax(ShiftUp(ymax2, 1, item_ct1), ShiftDown(ymax2, 1, item_ct1));
+
+      float minv = sycl::fmin(sycl::fmin(nmin2, ymin1), ymin3);
+      minv = sycl::fmin(sycl::fmin(minv, d10), d12);
+      float maxv = sycl::fmax(sycl::fmax(nmax2, ymax1), ymax3);
+      maxv = sycl::fmax(sycl::fmax(maxv, d10), d12);
+
+      if (tx > 0 && tx < MINMAX_W + 1 && xpos <= maxx)
+        ptbits |= ((d11 < sycl::fmin(-thresh, minv)) |
+                   (d11 > sycl::fmax(thresh, maxv)))
+                  << y;
+    }
+  }
+  // Compact the candidates of all work-items into points[] using a prefix sum over the per-item counts.
+  unsigned int totbits = sycl::popcount(ptbits);
+  unsigned int numbits = totbits;
+  for (int d = 1; d < 32; d <<= 1)
+  {
+    unsigned int num = ShiftUp(totbits, d, item_ct1);
+    if (tx >= d)
+      totbits += num;
+  }
+  int pos = totbits - numbits; // first free slot for this work-item, assuming the loop above formed an inclusive prefix sum
+  for (int y = 0; y < yloops; y++)
+  {
+    int ypos = MINMAX_H * item_ct1.get_group(1) + y;
+    if (ptbits & (1 << y) && pos < MEMWID)
+    {
+      points[2 * pos + 0] = xpos - 1;
+      points[2 * pos + 1] = ypos;
+      pos++;
+    }
+  }
+  sycl::group_barrier(item_ct1.get_sub_group()); // points[] entries are read below by other work-items than those that wrote them
+  totbits = Shuffle(totbits, 31, item_ct1); // presumably broadcasts lane 31's running total - confirm against Shuffle
+  if (tx < totbits)
+  {
+    int xpos = points[2 * tx + 0];
+    int ypos = points[2 * tx + 1];
+    int ptr = xpos + (ypos + (scale + 1) * height) * pitch;
+    float val = d_Data0[ptr];
+    float *data1 = &d_Data0[ptr];
+    float dxx = 2.0f * val - data1[-1] - data1[1];
+    float dyy = 2.0f * val - data1[-pitch] - data1[pitch];
+    float dxy = 0.25f * (data1[+pitch + 1] + data1[-pitch - 1] - data1[-pitch + 1] - data1[+pitch - 1]);
+    float tra = dxx + dyy;
+    float det = dxx * dyy - dxy * dxy;
+    if (tra * tra < edgeLimit * det) // reject edge-like responses using trace^2 / determinant
+    {
+      float edge = (tra * tra) / det;
+      float dx = 0.5f * (data1[1] - data1[-1]);
+      float dy = 0.5f * (data1[pitch] - data1[-pitch]);
+      float *data0 = d_Data0 + ptr - height * pitch;
+      float *data2 = d_Data0 + ptr + height * pitch;
+      float ds = 0.5f * (data0[0] - data2[0]);
+      float dss = 2.0f * val - data2[0] - data0[0];
+      float dxs = 0.25f * (data2[1] + data0[-1] - data0[1] - data2[-1]);
+      float dys = 0.25f * (data2[pitch] + data0[-pitch] - data2[-pitch] - data0[pitch]);
+      float idxx = dyy * dss - dys * dys; // cofactors of the symmetric 3x3 second-derivative matrix
+      float idxy = dys * dxs - dxy * dss;
+      float idxs = dxy * dys - dyy * dxs;
+      float idet = 1.0f / (idxx * dxx + idxy * dxy + idxs * dxs);
+      float idyy = dxx * dss - dxs * dxs;
+      float idys = dxy * dxs - dxx * dys;
+      float idss = dxx * dyy - dxy * dxy;
+      float pdx = idet * (idxx * dx + idxy * dy + idxs * ds);
+      float pdy = idet * (idxy * dx + idyy * dy + idys * ds);
+      float pds = idet * (idxs * dx + idys * dy + idss * ds);
+      if (pdx < -0.5f || pdx > 0.5f || pdy < -0.5f || pdy > 0.5f || pds < -0.5f || pds > 0.5f)
+      {
+        pdx = dx / dxx; // offset left the sample cell: fall back to independent per-axis estimates
+        pdy = dy / dyy;
+        pds = ds / dss;
+      }
+      float dval = 0.5f * (dx * pdx + dy * pdy + ds * pds);
+      int maxPts = d_MaxNumPoints;
+      float sc = sycl::pow<float>(2.0f, (float)scale / NUM_SCALES) *
+                 sycl::exp2(pds * factor);
+      if (sc >= lowestScale)
+      {
+        sycl::atomic<unsigned int>(
+            sycl::global_ptr<unsigned int>(&d_PointCounter[2 * octave + 0]))
+            .fetch_max(d_PointCounter[2 * octave - 1]);
+        unsigned int idx = infra::atomic_fetch_compare_inc(
+            &d_PointCounter[2 * octave + 0], (unsigned int)0x7fffffff);
+        idx = (idx >= maxPts ? maxPts - 1 : idx); // once the list is full, further points all land in the last slot
+        d_Sift[idx].xpos = xpos + pdx;
+        d_Sift[idx].ypos = ypos + pdy;
+        d_Sift[idx].scale = sc;
+        d_Sift[idx].sharpness = val + dval;
+        d_Sift[idx].edgeness = edge;
+        d_Sift[idx].subsampling = subsampling;
+      }
+    }
+  }
+}
+// Separable symmetric filtering of one image row at LAPLACE_S scales; writes differences of consecutive scales to d_Result.
+void LaplaceMultiMem(float *d_Image, float *d_Result, int width, int pitch, int height, int octave, sycl::nd_item<3> item_ct1, float *d_LaplaceKernel, float *buff)
+{
+  const int tx = item_ct1.get_local_id(2);
+  const int xp = item_ct1.get_group(2) * LAPLACE_W + tx;
+  const int yp = item_ct1.get_group(1);
+  float *data = d_Image + sycl::max(sycl::min((int)(xp - LAPLACE_R), (int)(width - 1)), 0);
+  float temp[2 * LAPLACE_R + 1];
+
+  float kern[LAPLACE_S][LAPLACE_R + 1];
+  // temp: vertical neighbourhood of this column; kern: per-scale copy of the filter taps (index 0 = centre tap).
+
+  // Pass 1 (vertical): each work-item filters its column and stores one value per scale in buff.
+
+  if (xp < (width + 2 * LAPLACE_R))
+  {
+    for (int i = 0; i <= 2 * LAPLACE_R; i++)
+      temp[i] = data[sycl::max(0, sycl::min((int)(yp + i - LAPLACE_R),
+                                            (int)(height - 1))) *
+                     pitch];
+    for (int scale = 0; scale < LAPLACE_S; scale++)
+    {
+      float *buf = buff + (LAPLACE_W + 2 * LAPLACE_R) * scale;
+      float *kernel = d_LaplaceKernel + octave * 12 * 16 + scale * 16;
+      kern[scale][0] = kernel[0];
+      kern[scale][1] = kernel[1];
+      kern[scale][2] = kernel[2];
+      kern[scale][3] = kernel[3];
+      kern[scale][4] = kernel[4];
+
+      float sum = kern[scale][0] * temp[LAPLACE_R];
+
+      // #pragma unroll
+      for (int j = 1; j <= LAPLACE_R; j++)
+        sum += kern[scale][j] * (temp[LAPLACE_R - j] + temp[LAPLACE_R + j]);
+      // One vertically filtered sample per work-item and scale.
+      buf[tx] = sum;
+    }
+  }
+  // Pass 2 (horizontal) reads neighbours' samples, so every pass-1 store must be complete first.
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+  if (tx < LAPLACE_W && xp < width) // image columns only: their taps stay within samples written in pass 1
+  {
+    int scale = 0;
+    float oldRes = kern[scale][0] * buff[tx + LAPLACE_R];
+    // oldRes accumulates the horizontal filter response of scale 0.
+
+    // #pragma unroll
+    for (int j = 1; j <= LAPLACE_R; j++)
+      oldRes += kern[scale][j] * (buff[tx + LAPLACE_R - j] + buff[tx + LAPLACE_R + j]);
+    // Each further scale yields res; output plane (scale - 1) receives res - oldRes.
+    for (int scale = 1; scale < LAPLACE_S; scale++)
+    {
+      float *buf = buff + (LAPLACE_W + 2 * LAPLACE_R) * scale;
+      float res = kern[scale][0] * buf[tx + LAPLACE_R];
+      // Same symmetric (2 * LAPLACE_R + 1)-tap sum as above, on this scale's slice of buff.
+
+      // #pragma unroll
+      for (int j = 1; j <= LAPLACE_R; j++)
+        res += kern[scale][j] * (buf[tx + LAPLACE_R - j] + buf[tx + LAPLACE_R + j]);
+      // Output planes are height * pitch floats apart.
+      d_Result[(scale - 1) * height * pitch + yp * pitch + xp] = res - oldRes;
+      oldRes = res;
+    }
+  }
+}
+// Variant of LaplaceMultiMem in which each of the first LAPLACE_W / 4 work-items produces four adjacent output columns as a float4.
+void LaplaceMultiMemWide(float *d_Image, float *d_Result, int width, int pitch, int height, int octave,
+                         sycl::nd_item<3> item_ct1, float *d_LaplaceKernel,
+                         float *buff)
+{
+  // kern[scale][i] holds kernel[LAPLACE_R - i]; the unrolled sums below use indices 0..4 - assumes LAPLACE_R == 4, TODO confirm.
+  const int tx = item_ct1.get_local_id(2);
+  const int xp = item_ct1.get_group(2) * LAPLACE_W + tx;
+  const int xp4 = item_ct1.get_group(2) * LAPLACE_W + 4 * tx; // first of the four output columns of this work-item
+  const int yp = item_ct1.get_group(1);
+  float kern[LAPLACE_S][LAPLACE_R + 1];
+  float *data =
+      d_Image + sycl::max(sycl::min((int)(xp - 4), (int)(width - 1)), 0);
+  float temp[9]; // rows yp - 4 .. yp + 4 of this column, clamped to the image
+  if (xp < (width + 2 * LAPLACE_R))
+  {
+    for (int i = 0; i < 4; i++)
+      temp[i] =
+          data[sycl::max(0, sycl::min((int)(yp + i - 4), (int)(height - 1))) *
+               pitch];
+    for (int i = 4; i < 8 + 1; i++)
+      temp[i] = data[sycl::min((int)(yp + i - 4), (int)(height - 1)) * pitch];
+    for (int scale = 0; scale < LAPLACE_S; scale++)
+    {
+      float *kernel = d_LaplaceKernel + octave * 12 * 16 + scale * 16;
+      for (int i = 0; i <= LAPLACE_R; i++)
+        kern[scale][i] = kernel[LAPLACE_R - i];
+      float *buf = buff + (LAPLACE_W + 2 * LAPLACE_R) * scale;
+      buf[tx] = kern[scale][4] * temp[4] +
+                kern[scale][3] * (temp[3] + temp[5]) + kern[scale][2] * (temp[2] + temp[6]) +
+                kern[scale][1] * (temp[1] + temp[7]) + kern[scale][0] * (temp[0] + temp[8]);
+    }
+  }
+  // All vertical results must be in buff before the horizontal float4 pass reads them.
+  item_ct1.barrier();
+  if (tx < LAPLACE_W / 4 && xp4 < width)
+  {
+    sycl::float4 b0 = reinterpret_cast<sycl::float4 *>(buff)[tx + 0]; // buff floats 4*tx .. 4*tx + 3 (scale 0)
+    sycl::float4 b1 = reinterpret_cast<sycl::float4 *>(buff)[tx + 1]; // the four centre samples
+    sycl::float4 b2 = reinterpret_cast<sycl::float4 *>(buff)[tx + 2];
+    sycl::float4 old4, new4, dif4;
+    old4.x() = kern[0][4] * b1.x() + kern[0][3] * (b0.w() + b1.y()) +
+               kern[0][2] * (b0.z() + b1.z()) + kern[0][1] * (b0.y() + b1.w()) +
+               kern[0][0] * (b0.x() + b2.x());
+    old4.y() = kern[0][4] * b1.y() + kern[0][3] * (b1.x() + b1.z()) +
+               kern[0][2] * (b0.w() + b1.w()) + kern[0][1] * (b0.z() + b2.x()) +
+               kern[0][0] * (b0.y() + b2.y());
+    old4.z() = kern[0][4] * b1.z() + kern[0][3] * (b1.y() + b1.w()) +
+               kern[0][2] * (b1.x() + b2.x()) + kern[0][1] * (b0.w() + b2.y()) +
+               kern[0][0] * (b0.z() + b2.z());
+    old4.w() = kern[0][4] * b1.w() + kern[0][3] * (b1.z() + b2.x()) +
+               kern[0][2] * (b1.y() + b2.y()) + kern[0][1] * (b1.x() + b2.z()) +
+               kern[0][0] * (b0.w() + b2.w());
+    for (int scale = 1; scale < LAPLACE_S; scale++)
+    {
+      float *buf = buff + (LAPLACE_W + 2 * LAPLACE_R) * scale;
+      sycl::float4 b0 = reinterpret_cast<sycl::float4 *>(buf)[tx + 0];
+      sycl::float4 b1 = reinterpret_cast<sycl::float4 *>(buf)[tx + 1];
+      sycl::float4 b2 = reinterpret_cast<sycl::float4 *>(buf)[tx + 2];
+      new4.x() = kern[scale][4] * b1.x() + kern[scale][3] * (b0.w() + b1.y()) +
+                 kern[scale][2] * (b0.z() + b1.z()) +
+                 kern[scale][1] * (b0.y() + b1.w()) +
+                 kern[scale][0] * (b0.x() + b2.x());
+      new4.y() = kern[scale][4] * b1.y() + kern[scale][3] * (b1.x() + b1.z()) +
+                 kern[scale][2] * (b0.w() + b1.w()) +
+                 kern[scale][1] * (b0.z() + b2.x()) +
+                 kern[scale][0] * (b0.y() + b2.y());
+      new4.z() = kern[scale][4] * b1.z() + kern[scale][3] * (b1.y() + b1.w()) +
+                 kern[scale][2] * (b1.x() + b2.x()) +
+                 kern[scale][1] * (b0.w() + b2.y()) +
+                 kern[scale][0] * (b0.z() + b2.z());
+      new4.w() = kern[scale][4] * b1.w() + kern[scale][3] * (b1.z() + b2.x()) +
+                 kern[scale][2] * (b1.y() + b2.y()) +
+                 kern[scale][1] * (b1.x() + b2.z()) +
+                 kern[scale][0] * (b0.w() + b2.w());
+      dif4.x() = new4.x() - old4.x(); // difference between this scale and the previous one
+      dif4.y() = new4.y() - old4.y();
+      dif4.z() = new4.z() - old4.z();
+      dif4.w() = new4.w() - old4.w();
+      reinterpret_cast<sycl::float4 *>(
+          &d_Result[(scale - 1) * height * pitch + yp * pitch + xp4])[0] = dif4;
+      old4 = new4;
+    }
+  }
+}
+// Laplace variant where local id 1 selects the scale and each work-group processes LAPLACE_H consecutive rows.
+void LaplaceMultiMemTest(float *d_Image, float *d_Result, int width, int pitch, int height, int octave,
+                         sycl::nd_item<3> item_ct1, float *d_LaplaceKernel,
+                         float *data1, float *data2)
+{
+  // data1: one (LAPLACE_W + 2 * LAPLACE_R)-float slice per scale; data2: one LAPLACE_W-float slice per scale.
+  const int tx = item_ct1.get_local_id(2);
+  const int xp = item_ct1.get_group(2) * LAPLACE_W + tx;
+  const int yp = LAPLACE_H * item_ct1.get_group(1);
+  const int scale = item_ct1.get_local_id(1);
+  float *kernel = d_LaplaceKernel + octave * 12 * 16 + scale * 16;
+  float *sdata1 = data1 + (LAPLACE_W + 2 * LAPLACE_R) * scale;
+  float *data =
+      d_Image + sycl::max(sycl::min((int)(xp - 4), (int)(width - 1)), 0);
+  int h = height - 1;
+  float temp[8 + LAPLACE_H], kern[LAPLACE_R + 1]; // temp: rows yp - 4 .. yp + LAPLACE_H + 3 of this column, clamped
+  for (int i = 0; i < 4; i++)
+    temp[i] = data[sycl::max(0, sycl::min((int)(yp + i - 4), h)) * pitch];
+  for (int i = 4; i < 8 + LAPLACE_H; i++)
+    temp[i] = data[sycl::min((int)(yp + i - 4), h) * pitch];
+  for (int i = 0; i <= LAPLACE_R; i++)
+    kern[i] = kernel[LAPLACE_R - i]; // reversed copy: kern[LAPLACE_R] is kernel[0]
+  for (int j = 0; j < LAPLACE_H; j++)
+  {
+    sdata1[tx] = kern[4] * temp[4 + j] +
+                 kern[3] * (temp[3 + j] + temp[5 + j]) + kern[2] * (temp[2 + j] + temp[6 + j]) +
+                 kern[1] * (temp[1 + j] + temp[7 + j]) + kern[0] * (temp[0 + j] + temp[8 + j]);
+    // Vertical result for row yp + j is in sdata1; wait before neighbours read it horizontally.
+    item_ct1.barrier();
+    float *sdata2 = data2 + LAPLACE_W * scale;
+    if (tx < LAPLACE_W)
+    {
+      sdata2[tx] = kern[4] * sdata1[tx + 4] +
+                   kern[3] * (sdata1[tx + 3] + sdata1[tx + 5]) + kern[2] * (sdata1[tx + 2] + sdata1[tx + 6]) +
+                   kern[1] * (sdata1[tx + 1] + sdata1[tx + 7]) + kern[0] * (sdata1[tx + 0] + sdata1[tx + 8]);
+    }
+    // The difference below also reads the next scale's slice of data2, so synchronise again.
+    item_ct1.barrier();
+    if (tx < LAPLACE_W && scale < LAPLACE_S - 1 && xp < width && (yp + j) < height)
+      d_Result[scale * height * pitch + (yp + j) * pitch + xp] = sdata2[tx] - sdata2[tx + LAPLACE_W];
+  }
+}
+// Laplace variant computing one image row per work-group, with local id 1 selecting the scale.
+void LaplaceMultiMemOld(float *d_Image, float *d_Result, int width, int pitch, int height, int octave,
+                        sycl::nd_item<3> item_ct1, float *d_LaplaceKernel,
+                        float *data1, float *data2)
+{
+  // Here kernel[0] is the centre tap and kernel[1..4] weight the symmetric outer samples.
+  const int tx = item_ct1.get_local_id(2);
+  const int xp = item_ct1.get_group(2) * LAPLACE_W + tx;
+  const int yp = item_ct1.get_group(1);
+  const int scale = item_ct1.get_local_id(1);
+  float *kernel = d_LaplaceKernel + octave * 12 * 16 + scale * 16;
+  float *sdata1 = data1 + (LAPLACE_W + 2 * LAPLACE_R) * scale;
+  float *data =
+      d_Image + sycl::max(sycl::min((int)(xp - 4), (int)(width - 1)), 0);
+  int h = height - 1; // last valid row, used to clamp vertical taps
+  sdata1[tx] =
+      kernel[0] * data[sycl::min(yp, h) * pitch] +
+      kernel[1] * (data[sycl::max(0, sycl::min((int)(yp - 1), h)) * pitch] +
+                   data[sycl::min((int)(yp + 1), h) * pitch]) +
+      kernel[2] * (data[sycl::max(0, sycl::min((int)(yp - 2), h)) * pitch] +
+                   data[sycl::min((int)(yp + 2), h) * pitch]) +
+      kernel[3] * (data[sycl::max(0, sycl::min((int)(yp - 3), h)) * pitch] +
+                   data[sycl::min((int)(yp + 3), h) * pitch]) +
+      kernel[4] * (data[sycl::max(0, sycl::min((int)(yp - 4), h)) * pitch] +
+                   data[sycl::min((int)(yp + 4), h) * pitch]);
+  // Vertical pass stored in sdata1; synchronise before the horizontal pass reads neighbouring entries.
+  item_ct1.barrier();
+  float *sdata2 = data2 + LAPLACE_W * scale;
+  if (tx < LAPLACE_W)
+  {
+    sdata2[tx] = kernel[0] * sdata1[tx + 4] +
+                 kernel[1] * (sdata1[tx + 3] + sdata1[tx + 5]) +
+                 kernel[2] * (sdata1[tx + 2] + sdata1[tx + 6]) +
+                 kernel[3] * (sdata1[tx + 1] + sdata1[tx + 7]) +
+                 kernel[4] * (sdata1[tx + 0] + sdata1[tx + 8]);
+  }
+  // Output is this scale's response minus the next scale's (data2 slices are LAPLACE_W floats apart).
+  item_ct1.barrier();
+  if (tx < LAPLACE_W && scale < LAPLACE_S - 1 && xp < width)
+    d_Result[scale * height * pitch + yp * pitch + xp] = sdata2[tx] - sdata2[tx + LAPLACE_W];
+}
+// Separable 9-tap symmetric low-pass filter: vertical pass into the scratch buffer, then horizontal pass into d_Result.
+void LowPass(float *d_Image, float *d_Result, int width, int pitch, int height,
+             sycl::nd_item<3> item_ct1, float *d_LowPassKernel, float *buffer)
+{
+  // kernel[4] is the centre tap; kernel[0] weights the samples four rows/columns away.
+  const int tx = item_ct1.get_local_id(2);
+  const int ty = item_ct1.get_local_id(1);
+  const int xp = item_ct1.get_group(2) * LOWPASS_W + tx;
+  const int yp = item_ct1.get_group(1) * LOWPASS_H + ty;
+  float *kernel = d_LowPassKernel;
+  float *data =
+      d_Image + sycl::max(sycl::min((int)(xp - 4), (int)(width - 1)), 0);
+  float *buff = buffer + ty * (LOWPASS_W + 2 * LOWPASS_R); // each local row ty owns its own slice of buffer
+  int h = height - 1;
+  if (yp < height)
+    buff[tx] =
+        kernel[4] * data[sycl::min(yp, h) * pitch] +
+        kernel[3] * (data[sycl::max(0, sycl::min((int)(yp - 1), h)) * pitch] +
+                     data[sycl::min((int)(yp + 1), h) * pitch]) +
+        kernel[2] * (data[sycl::max(0, sycl::min((int)(yp - 2), h)) * pitch] +
+                     data[sycl::min((int)(yp + 2), h) * pitch]) +
+        kernel[1] * (data[sycl::max(0, sycl::min((int)(yp - 3), h)) * pitch] +
+                     data[sycl::min((int)(yp + 3), h) * pitch]) +
+        kernel[0] * (data[sycl::max(0, sycl::min((int)(yp - 4), h)) * pitch] +
+                     data[sycl::min((int)(yp + 4), h) * pitch]);
+  // The horizontal pass reads buff entries written by other work-items of the same row, hence the barrier.
+  item_ct1.barrier();
+  if (tx < LOWPASS_W && xp < width && yp < height)
+    d_Result[yp * pitch + xp] = kernel[4] * buff[tx + 4] +
+                                kernel[3] * (buff[tx + 3] + buff[tx + 5]) + kernel[2] * (buff[tx + 2] + buff[tx + 6]) +
+                                kernel[1] * (buff[tx + 1] + buff[tx + 7]) + kernel[0] * (buff[tx + 0] + buff[tx + 8]);
+}
+// Low-pass filter whose horizontal pass uses ShiftDown exchanges (defined elsewhere) and whose row results live in the xrows ring.
+void LowPassBlockOld(float *d_Image, float *d_Result, int width, int pitch, int height,
+                     sycl::nd_item<3> item_ct1, float *d_LowPassKernel,
+                     sycl::accessor<float, 2, sycl::access_mode::read_write, sycl::access::target::local> xrows)
+{
+  // xrows rows are indexed modulo N = 16; assumes its first extent is at least N - TODO confirm at the call site.
+  const int tx = item_ct1.get_local_id(2);
+  const int ty = item_ct1.get_local_id(1);
+  const int xp = item_ct1.get_group(2) * LOWPASS_W + tx;
+  const int yp = item_ct1.get_group(1) * LOWPASS_H + ty;
+  const int N = 16;
+  float *k = d_LowPassKernel;
+  int xl = sycl::max(sycl::min((int)(xp - 4), (int)(width - 1)), 0);
+  for (int l = -8; l <= LOWPASS_H; l += 4)
+  {
+    if (l < LOWPASS_H) // load and horizontally filter image row yp + l + 4 (clamped)
+    {
+      int yl = sycl::max(sycl::min((int)(yp + l + 4), (int)(height - 1)), 0);
+      float val = d_Image[yl * pitch + xl];
+      xrows[(l + 8 + ty) % N][tx] =
+          k[4] * ShiftDown(val, 4, item_ct1) +
+          k[3] * (ShiftDown(val, 5, item_ct1) + ShiftDown(val, 3, item_ct1)) +
+          k[2] * (ShiftDown(val, 6, item_ct1) + ShiftDown(val, 2, item_ct1)) +
+          k[1] * (ShiftDown(val, 7, item_ct1) + ShiftDown(val, 1, item_ct1)) +
+          k[0] * (ShiftDown(val, 8, item_ct1) + val);
+    }
+    if (l >= 4) // vertical pass over nine ring rows; writes output row yp + l - 4
+    {
+      int ys = yp + l - 4;
+      if (xp < width && ys < height && tx < LOWPASS_W)
+        d_Result[ys * pitch + xp] = k[4] * xrows[(l + 0 + ty) % N][tx] +
+                                    k[3] * (xrows[(l - 1 + ty) % N][tx] + xrows[(l + 1 + ty) % N][tx]) +
+                                    k[2] * (xrows[(l - 2 + ty) % N][tx] + xrows[(l + 2 + ty) % N][tx]) +
+                                    k[1] * (xrows[(l - 3 + ty) % N][tx] + xrows[(l + 3 + ty) % N][tx]) +
+                                    k[0] * (xrows[(l - 4 + ty) % N][tx] + xrows[(l + 4 + ty) % N][tx]);
+    }
+    if (l >= 0)
+      // The barrier below is the body of the if above: it runs only for l >= 0, which is the same for every work-item.
+      // The vertical pass of the next step reads xrows entries written by other work-items.
+      item_ct1.barrier(sycl::access::fence_space::local_space);
+  }
+}
+// Low-pass filter: fills the xrows ring with horizontally filtered rows, then combines nine ring rows vertically per output.
+void LowPassBlock(float *d_Image, float *d_Result, int width, int pitch, int height,
+                  sycl::nd_item<3> item_ct1, float *d_LowPassKernel,
+                  sycl::accessor<float, 2, sycl::access_mode::read_write, sycl::access::target::local> xrows)
+{
+  // The l-loops advance four rows at a time and add ty - presumably the launch uses four rows of work-items; confirm.
+  const int tx = item_ct1.get_local_id(2);
+  const int ty = item_ct1.get_local_id(1);
+  const int xp = item_ct1.get_group(2) * LOWPASS_W + tx;
+  const int yp = item_ct1.get_group(1) * LOWPASS_H + ty;
+  const int N = 16; // ring size used for the modulo row indices below
+  float *k = d_LowPassKernel;
+  int xl = sycl::max(sycl::min((int)(xp - 4), (int)(width - 1)), 0);
+  // #pragma unroll
+  for (int l = -8; l < 4; l += 4)
+  {
+    int ly = l + ty;
+    int yl = sycl::max(sycl::min((int)(yp + l + 4), (int)(height - 1)), 0);
+    float val = d_Image[yl * pitch + xl];
+    val = k[4] * ShiftDown(val, 4, item_ct1) +
+          k[3] * (ShiftDown(val, 5, item_ct1) + ShiftDown(val, 3, item_ct1)) +
+          k[2] * (ShiftDown(val, 6, item_ct1) + ShiftDown(val, 2, item_ct1)) +
+          k[1] * (ShiftDown(val, 7, item_ct1) + ShiftDown(val, 1, item_ct1)) +
+          k[0] * (ShiftDown(val, 8, item_ct1) + val);
+    xrows[ly + 8][tx] = val;
+  }
+  // Prologue rows are stored at xrows[l + ty + 8]; synchronise before the main loop reads them.
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+  // #pragma unroll
+  for (int l = 4; l < LOWPASS_H; l += 4)
+  {
+    int ly = l + ty;
+    int yl = sycl::min((int)(yp + l + 4), (int)(height - 1));
+    float val = d_Image[yl * pitch + xl];
+    val = k[4] * ShiftDown(val, 4, item_ct1) +
+          k[3] * (ShiftDown(val, 5, item_ct1) + ShiftDown(val, 3, item_ct1)) +
+          k[2] * (ShiftDown(val, 6, item_ct1) + ShiftDown(val, 2, item_ct1)) +
+          k[1] * (ShiftDown(val, 7, item_ct1) + ShiftDown(val, 1, item_ct1)) +
+          k[0] * (ShiftDown(val, 8, item_ct1) + val);
+    xrows[(ly + 8) % N][tx] = val;
+    int ys = yp + l - 4; // output row produced in this step
+    if (xp < width && ys < height && tx < LOWPASS_W)
+      d_Result[ys * pitch + xp] = k[4] * xrows[(ly + 0) % N][tx] +
+                                  k[3] * (xrows[(ly - 1) % N][tx] + xrows[(ly + 1) % N][tx]) +
+                                  k[2] * (xrows[(ly - 2) % N][tx] + xrows[(ly + 2) % N][tx]) +
+                                  k[1] * (xrows[(ly - 3) % N][tx] + xrows[(ly + 3) % N][tx]) +
+                                  k[0] * (xrows[(ly - 4) % N][tx] + xrows[(ly + 4) % N][tx]);
+    // Separate this step's xrows reads and writes from those of the next step.
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+  }
+  int ly = LOWPASS_H + ty; // final step: no new input row is loaded, only the last outputs are written
+  int ys = yp + LOWPASS_H - 4;
+  if (xp < width && ys < height && tx < LOWPASS_W)
+    d_Result[ys * pitch + xp] = k[4] * xrows[(ly + 0) % N][tx] +
+                                k[3] * (xrows[(ly - 1) % N][tx] + xrows[(ly + 1) % N][tx]) +
+                                k[2] * (xrows[(ly - 2) % N][tx] + xrows[(ly + 2) % N][tx]) +
+                                k[1] * (xrows[(ly - 3) % N][tx] + xrows[(ly + 3) % N][tx]) +
+                                k[0] * (xrows[(ly - 4) % N][tx] + xrows[(ly + 4) % N][tx]);
+}
diff --git a/cudaSiftD.h b/src/cudaSiftD.h
similarity index 100%
rename from cudaSiftD.h
rename to src/cudaSiftD.h
diff --git a/src/cudaSiftH.dp.cpp b/src/cudaSiftH.dp.cpp
new file mode 100644
index 0000000..d515eb5
--- /dev/null
+++ b/src/cudaSiftH.dp.cpp
@@ -0,0 +1,806 @@
+// Modifications Copyright (C) 2023 Intel Corporation
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom
+// the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
+// OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+// OR OTHER DEALINGS IN THE SOFTWARE.
+
+// SPDX-License-Identifier: MIT
+
+#include <sycl/sycl.hpp>
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+#include <iostream>
+#include <algorithm>
+
+#include "cudautils.h"
+#include "cudaImage.h"
+#include "cudaSift.h"
+#include "cudaSiftD.h"
+#include "cudaSiftH.h"
+#include "cudaSiftD.dp.cpp"
+
+#define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F)) // rounds x up to the next multiple of 32
+
+template <>
+struct sycl::is_device_copyable<CudaImage> : std::true_type // marks CudaImage as device-copyable for SYCL
+{
+};
+
+void InitCuda(sycl::queue &q_ct, int devNum)
+{ // Prints the name, max work-group size and max clock frequency of q_ct's device; devNum is not used here.
+  auto device = q_ct.get_device();
+  std::cout << "Device Name:          " << device.get_info<sycl::info::device::name>() << std::endl;
+  std::cout << "Max workgroup size:   " << device.get_info<sycl::info::device::max_work_group_size>() << std::endl;
+  std::cout << "Max clock freq:   " << device.get_info<sycl::info::device::max_clock_frequency>() << std::endl;
+}
+
+float *AllocSiftTempMemory(int width, int height, int numOctaves, sycl::queue &q_ct, float &time, bool scaleUp)
+{ // Allocates one scratch buffer sized for all pyramid images plus all Laplace buffers and returns it.
+  const int nd = NUM_SCALES + 3;
+  int w = width * (scaleUp ? 2 : 1); // both dimensions are doubled when scaleUp is set
+  int h = height * (scaleUp ? 2 : 1);
+  int p = iAlignUp(w, 128);
+  int size = h * p;         // image sizes
+  int sizeTmp = nd * h * p; // laplace buffer sizes
+  for (int i = 0; i < numOctaves; i++)
+  { // add the requirement of each successively halved level
+    w /= 2;
+    h /= 2;
+    int p = iAlignUp(w, 128); // shadows the outer p for this level only
+    size += h * p;
+    sizeTmp += nd * h * p;
+  }
+  float *memoryTmp = NULL;
+  size_t pitch; // handed to sift_malloc; not read afterwards in this function
+  size += sizeTmp;
+  // size counts floats; NOTE(review): the 4096 / row-count arguments presumably mirror a pitched allocation - confirm in infra::sift_malloc.
+#ifdef DEVICE_TIMER
+  auto start_malloc = std::chrono::steady_clock::now();
+#endif
+  memoryTmp = (float *)infra::sift_malloc(pitch, (size_t)4096, (size + 4095) / 4096 * sizeof(float), q_ct);
+  q_ct.wait();
+#ifdef DEVICE_TIMER
+  auto stop_malloc = std::chrono::steady_clock::now();
+  // printf("Malloc time for memoryTmp =          %.2f us\n", std::chrono::duration<float, std::micro>(stop_malloc - start_malloc).count());
+  time += std::chrono::duration<float, std::micro>(stop_malloc - start_malloc).count(); // accumulated in microseconds
+#endif
+  return memoryTmp;
+}
+
+void FreeSiftTempMemory(float *memoryTmp, sycl::queue &q_ct)
+{ // Frees memoryTmp with sycl::free on q_ct; a null pointer is ignored.
+  if (memoryTmp)
+
+    safeCall((sycl::free(memoryTmp, q_ct), 0));
+}
+
+void ExtractSift(SiftData &siftData, CudaImage &img, int numOctaves, double initBlur, float thresh, sycl::queue &q_ct,
+                 float &totTime, float lowestScale, bool scaleUp, float *tempMemory)
+{ // Host entry point: extracts SIFT points from img into siftData, using tempMemory or a buffer allocated here.
+  unsigned int *d_PointCounterAddr;
+  // Zero the device point counters and upload the point limit before any kernel runs.
+#ifdef DEVICE_TIMER
+  auto start_memcpy = std::chrono::steady_clock::now();
+#endif
+  *((void **)&d_PointCounterAddr) = d_PointCounter.get_ptr();
+  q_ct.memset(d_PointCounterAddr, 0, (8 * 2 + 1) * sizeof(int));
+  q_ct.memcpy(d_MaxNumPoints.get_ptr(), &siftData.maxPts, sizeof(int));
+  q_ct.wait();
+
+#ifdef DEVICE_TIMER
+  auto stop_memcpy = std::chrono::steady_clock::now();
+  totTime += std::chrono::duration<float, std::micro>(stop_memcpy - start_memcpy).count();
+#endif
+  // Same size computation as AllocSiftTempMemory: sizeTmp floats of Laplace buffers, then the image pyramid.
+  const int nd = NUM_SCALES + 3;
+  int w = img.width * (scaleUp ? 2 : 1);
+  int h = img.height * (scaleUp ? 2 : 1);
+  int p = iAlignUp(w, 128);
+  int width = w, height = h; // keep the full-resolution size; w and h are halved in the loop below
+  int size = h * p;         // image sizes
+  int sizeTmp = nd * h * p; // laplace buffer sizes
+  for (int i = 0; i < numOctaves; i++)
+  {
+    w /= 2;
+    h /= 2;
+    int p = iAlignUp(w, 128);
+    size += h * p;
+    sizeTmp += nd * h * p;
+  }
+  float *memoryTmp = tempMemory;
+  size += sizeTmp;
+  if (!tempMemory)
+  { // no caller-supplied scratch buffer: allocate one here (freed again near the end of this function)
+    size_t pitch;
+#ifdef DEVICE_TIMER
+    auto start_malloc2 = std::chrono::steady_clock::now();
+#endif
+    memoryTmp = (float *)infra::sift_malloc(pitch, (size_t)4096, (size + 4095) / 4096 * sizeof(float), q_ct);
+    q_ct.wait();
+
+#ifdef DEVICE_TIMER
+    auto stop_malloc2 = std::chrono::steady_clock::now();
+    // printf("Malloc time for memoryTmp =          %.2f us\n", std::chrono::duration<float, std::micro>(stop_malloc - start_malloc).count());
+    totTime += std::chrono::duration<float, std::micro>(stop_malloc2 - start_malloc2).count();
+#endif
+  }
+  float *memorySub = memoryTmp + sizeTmp; // image storage starts sizeTmp floats into the scratch buffer
+
+  CudaImage lowImg;
+  lowImg.Allocate(width, height, iAlignUp(width, 128), false, q_ct, totTime, memorySub);
+  if (!scaleUp)
+  { // native resolution: blur img into lowImg and run the octave recursion
+    float kernel[8 * 12 * 16];
+    PrepareLaplaceKernels(numOctaves, 0.0f, kernel);
+#ifdef DEVICE_TIMER
+    auto start_memcpy1 = std::chrono::steady_clock::now();
+#endif
+    q_ct.memcpy(d_LaplaceKernel.get_ptr(), kernel, 8 * 12 * 16 * sizeof(float));
+    q_ct.wait();
+
+#ifdef DEVICE_TIMER
+    auto stop_memcpy1 = std::chrono::steady_clock::now();
+    totTime += std::chrono::duration<float, std::micro>(stop_memcpy1 - start_memcpy1).count();
+#endif
+
+    LowPass(lowImg, img, fmax(initBlur, 0.001f), q_ct, totTime);
+
+    ExtractSiftLoop(siftData, lowImg, numOctaves, 0.0f, thresh, lowestScale, 1.0f, memoryTmp,
+                    memorySub + height * iAlignUp(width, 128), q_ct, totTime);
+
+#ifdef DEVICE_TIMER
+    auto start_memcpy2 = std::chrono::steady_clock::now();
+#endif
+    q_ct.memcpy(&siftData.numPts, &d_PointCounterAddr[2 * numOctaves], sizeof(int)); // read the point count back from the device counters
+    q_ct.wait();
+#ifdef DEVICE_TIMER
+    auto stop_memcpy2 = std::chrono::steady_clock::now();
+    totTime += std::chrono::duration<float, std::micro>(stop_memcpy2 - start_memcpy2).count();
+#endif
+    siftData.numPts = (siftData.numPts < siftData.maxPts ? siftData.numPts : siftData.maxPts); // clamp to maxPts
+  }
+  else
+  { // upscaled path: ScaleUp img into upImg (placed at the start of the scratch buffer) before blurring
+    CudaImage upImg;
+    upImg.Allocate(width, height, iAlignUp(width, 128), false, q_ct, totTime, memoryTmp);
+    ScaleUp(upImg, img, q_ct, totTime);
+    LowPass(lowImg, upImg, fmax(initBlur, 0.001f), q_ct, totTime);
+    float kernel[8 * 12 * 16];
+    PrepareLaplaceKernels(numOctaves, 0.0f, kernel);
+#ifdef DEVICE_TIMER
+    auto start_memcpy3 = std::chrono::steady_clock::now();
+#endif
+    safeCall(
+        (q_ct.memcpy(d_LaplaceKernel.get_ptr(), kernel,
+                     8 * 12 * 16 * sizeof(float)),
+         0));
+    q_ct.wait();
+#ifdef DEVICE_TIMER
+    auto stop_memcpy3 = std::chrono::steady_clock::now();
+    totTime += std::chrono::duration<float, std::micro>(stop_memcpy3 - start_memcpy3).count();
+#endif
+    ExtractSiftLoop(siftData, lowImg, numOctaves, 0.0f, thresh, lowestScale * 2.0f, 1.0f, memoryTmp,
+                    memorySub + height * iAlignUp(width, 128), q_ct, totTime);
+#ifdef DEVICE_TIMER
+    auto start_memcpy4 = std::chrono::steady_clock::now();
+#endif
+    safeCall((q_ct.memcpy(&siftData.numPts, &d_PointCounterAddr[2 * numOctaves],
+                          sizeof(int)),
+              0));
+    q_ct.wait();
+#ifdef DEVICE_TIMER
+    auto stop_memcpy4 = std::chrono::steady_clock::now();
+    totTime += std::chrono::duration<float, std::micro>(stop_memcpy4 - start_memcpy4).count();
+#endif
+    siftData.numPts = (siftData.numPts < siftData.maxPts ? siftData.numPts : siftData.maxPts);
+    RescalePositions(siftData, 0.5f, q_ct, totTime); // called with scale 0.5 after extraction on the doubled image
+  }
+
+  if (!tempMemory) // only free the scratch buffer if it was allocated in this call
+    safeCall((sycl::free(memoryTmp, q_ct), 0));
+  if (siftData.h_data)
+  { // a host buffer exists: copy the first numPts points back and report
+#ifdef DEVICE_TIMER
+    auto start_memcpy5 = std::chrono::steady_clock::now();
+#endif
+    q_ct.memcpy(siftData.h_data, siftData.d_data, sizeof(SiftPoint) * siftData.numPts);
+    q_ct.wait();
+#ifdef DEVICE_TIMER
+    auto stop_memcpy5 = std::chrono::steady_clock::now();
+    totTime += std::chrono::duration<float, std::micro>(stop_memcpy5 - start_memcpy5).count();
+    printf("Total time for sift extraction =  %.2f us\n\n", totTime);
+#endif
+    printf("Number of Points after sift extraction =  %d\n\n", siftData.numPts);
+  }
+}
+
+int ExtractSiftLoop(SiftData &siftData, CudaImage &img, int numOctaves, double initBlur, float thresh, float lowestScale,
+                    float subsampling, float *memoryTmp, float *memorySub, sycl::queue &q_ct, float &totTime)
+{ // Recurses into a half-size copy of img while octaves remain, then extracts this level; always returns 0.
+  if (numOctaves > 1)
+  {
+    const int halfW = img.width / 2;
+    const int halfH = img.height / 2;
+    const int halfPitch = iAlignUp(halfW, 128);
+    CudaImage subImg; // placed at memorySub; the next level starts right behind it
+    subImg.Allocate(halfW, halfH, halfPitch, false, q_ct, totTime, memorySub);
+    ScaleDown(subImg, img, 0.5f, q_ct, totTime);
+    float totInitBlur = (float)sqrt(initBlur * initBlur + 0.5f * 0.5f) / 2.0f;
+    ExtractSiftLoop(siftData, subImg, numOctaves - 1, totInitBlur, thresh, lowestScale,
+                    subsampling * 2.0f, memoryTmp, memorySub + halfH * halfPitch, q_ct, totTime);
+  }
+  ExtractSiftOctave(siftData, img, numOctaves, thresh, lowestScale, subsampling, memoryTmp, q_ct, totTime);
+  return 0;
+}
+
+void c1toc4(float *f_ptr, sycl::float4 *f4_ptr, int width, int height,
+            int f_pitch, int f4_pitch, sycl::id<2> idx)
+{
+  // Copies the float at (row, col) into the x component of the float4 at (row, col); width and height are unused.
+  const int row = idx[0];
+  const int col = idx[1];
+  const float *src_row = f_ptr + f_pitch * row;
+  sycl::float4 *dst_row = f4_ptr + f4_pitch * row;
+  dst_row[col].x() = src_row[col];
+}
+
+void ExtractSiftOctave(SiftData &siftData, CudaImage &img, int octave, float thresh, float lowestScale,
+                       float subsampling, float *memoryTmp, sycl::queue &q_ct, float &totTime)
+{ // Runs the per-octave stages on img: LaplaceMulti, FindPointsMulti, ComputeOrientations, ExtractSiftDescriptors.
+  const int nd = NUM_SCALES + 3;
+  CudaImage diffImg[nd];
+  int w = img.width;
+  int h = img.height;
+  int p = iAlignUp(w, 128);
+  for (int i = 0; i < nd - 1; i++)
+    diffImg[i].Allocate(w, h, p, false, q_ct, totTime, memoryTmp + i * p * h);
+  // diffImg[0 .. nd-2] are consecutive p*h-float slices of memoryTmp; the last array element is left unallocated.
+  // (The former baseBlur/diffScale locals were computed but never read, so they have been dropped.)
+  LaplaceMulti(img, diffImg, octave, q_ct, totTime);
+  FindPointsMulti(diffImg, siftData, thresh, 10.0f, 1.0f / NUM_SCALES, lowestScale / subsampling, subsampling, octave, q_ct, totTime);
+  ComputeOrientations(img, siftData, octave, q_ct, totTime);
+  ExtractSiftDescriptors(img.d_data, img.pitch, siftData, subsampling, octave, q_ct, totTime);
+}
+
+void InitSiftData(SiftData &data, sycl::queue &q_ct, float &time, int num, bool host, bool dev)
+{ // Resets data for up to num points and allocates the host (malloc) and/or device (malloc_device) point arrays.
+  data.numPts = 0;
+  data.maxPts = num;
+  size_t sz = sizeof(SiftPoint) * (size_t)num; // kept in size_t: an int byte count truncates for large num
+  data.h_data = NULL;
+  if (host)
+    data.h_data = (SiftPoint *)malloc(sz);
+  data.d_data = NULL;
+  if (dev)
+  {
+#ifdef DEVICE_TIMER
+    auto start_malloc = std::chrono::steady_clock::now();
+#endif
+    data.d_data = (SiftPoint *)sycl::malloc_device(sz, q_ct);
+    q_ct.wait();
+#ifdef DEVICE_TIMER
+    auto stop_malloc = std::chrono::steady_clock::now();
+    time += std::chrono::duration<float, std::micro>(stop_malloc - start_malloc).count(); // microseconds
+#endif
+  }
+}
+
+void FreeSiftData(SiftData &data, sycl::queue &q_ct)
+{
+  // Releases the device and host point arrays of data and resets its counters; safe to call twice.
+  if (data.d_data != NULL)
+    sycl::free(data.d_data, q_ct.get_context());
+  data.d_data = NULL;
+  free(data.h_data);  // free(NULL) is a no-op, so no guard is needed
+  data.h_data = NULL; // previously left dangling, which made a second call a double free
+  data.numPts = data.maxPts = 0;
+}
+
+void PrintSiftData(SiftData &data, sycl::queue &q_ct)
+{ // Prints every point in data to stdout, first copying the points from the device if no host copy exists.
+  SiftPoint *h_data = data.h_data;
+  if (data.h_data == NULL)
+  { // side effect: the buffer allocated here is stored in data.h_data and stays owned by data
+    h_data = (SiftPoint *)malloc(sizeof(SiftPoint) * data.maxPts);
+    q_ct.memcpy(h_data, data.d_data, sizeof(SiftPoint) * data.numPts)
+        .wait();
+    data.h_data = h_data;
+  }
+  for (int i = 0; i < data.numPts; i++)
+  {
+    printf("xpos         = %.2f\n", h_data[i].xpos);
+    printf("ypos         = %.2f\n", h_data[i].ypos);
+    printf("scale        = %.2f\n", h_data[i].scale);
+    printf("sharpness    = %.2f\n", h_data[i].sharpness);
+    printf("edgeness     = %.2f\n", h_data[i].edgeness);
+    printf("orientation  = %.2f\n", h_data[i].orientation);
+    printf("score        = %.2f\n", h_data[i].score);
+    float *siftData = (float *)&h_data[i].data;
+    for (int j = 0; j < 8; j++)
+    { // descriptor is printed as 8 rows of 16 values, element (j, k) read from index j + 8 * k
+      if (j == 0)
+        printf("data = ");
+      else
+        printf("       ");
+      for (int k = 0; k < 16; k++)
+        if (siftData[j + 8 * k] < 0.05) // values below 0.05 are shown as a dot
+          printf(" .   ");
+        else
+          printf("%.2f ", siftData[j + 8 * k]);
+      printf("\n");
+    }
+  }
+  printf("Number of available points: %d\n", data.numPts);
+  printf("Number of allocated points: %d\n", data.maxPts);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Host side master functions
+///////////////////////////////////////////////////////////////////////////////
+
+double ScaleDown(CudaImage &res, CudaImage &src, float variance, sycl::queue &q_ct, float &totTime)
+{ // Host wrapper: uploads a 5-tap kernel for `variance` when it changed, then runs the ScaleDown device kernel; returns 0.0.
+  static float oldVariance = -1.0f; // variance of the kernel uploaded last; shared across all calls
+  if (res.d_data == NULL || src.d_data == NULL)
+  {
+    printf("ScaleDown: missing data\n");
+    return 0.0;
+  }
+  if (oldVariance != variance)
+  { // rebuild and upload the filter taps only when the variance differs from the previous call
+    float h_Kernel[5];
+    float kernelSum = 0.0f;
+    for (int j = 0; j < 5; j++)
+    {
+      h_Kernel[j] = (float)expf(-(double)(j - 2) * (j - 2) / 2.0 / variance);
+      kernelSum += h_Kernel[j];
+    }
+    for (int j = 0; j < 5; j++)
+      h_Kernel[j] /= kernelSum; // normalize so the taps sum to 1
+
+#ifdef DEVICE_TIMER
+    auto start_memcpy = std::chrono::steady_clock::now();
+#endif
+    q_ct.memcpy(d_ScaleDownKernel.get_ptr(), h_Kernel, 5 * sizeof(float)).wait();
+#ifdef DEVICE_TIMER
+    auto stop_memcpy = std::chrono::steady_clock::now();
+    totTime += std::chrono::duration<float, std::micro>(stop_memcpy - start_memcpy).count();
+#endif
+    oldVariance = variance;
+  }
+#if 0
+  dim3 blocks(iDivUp(src.width, SCALEDOWN_W), iDivUp(src.height, SCALEDOWN_H));
+  dim3 threads(SCALEDOWN_W + 4, SCALEDOWN_H + 4);
+#else
+  sycl::range<3> blocks(1, iDivUp(src.height, SCALEDOWN_H),
+                        iDivUp(src.width, SCALEDOWN_W));
+  sycl::range<3> threads(1, 1, SCALEDOWN_W + 4);
+  // The local accessor sizes below are literals; the trailing comments give the macro expression each one stands for.
+#ifdef DEVICE_TIMER
+  auto start_kernel = std::chrono::steady_clock::now();
+#endif
+  q_ct.submit([&](sycl::handler &cgh)
+              {
+                                     d_ScaleDownKernel.init();
+
+                                     auto d_ScaleDownKernel_ptr_ct1 = d_ScaleDownKernel.get_ptr();
+
+                                     sycl::accessor<float, 1, sycl::access_mode::read_write,
+                                                    sycl::access::target::local>
+                                         inrow_acc_ct1(sycl::range<1>(68 /*SCALEDOWN_W+4*/), cgh);
+                                     sycl::accessor<float, 1, sycl::access_mode::read_write,
+                                                    sycl::access::target::local>
+                                         brow_acc_ct1(sycl::range<1>(160 /*5*(SCALEDOWN_W/2)*/), cgh);
+                                     sycl::accessor<int, 1, sycl::access_mode::read_write,
+                                                    sycl::access::target::local>
+                                         yRead_acc_ct1(sycl::range<1>(20 /*SCALEDOWN_H+4*/), cgh);
+                                     sycl::accessor<int, 1, sycl::access_mode::read_write,
+                                                    sycl::access::target::local>
+                                         yWrite_acc_ct1(sycl::range<1>(20 /*SCALEDOWN_H+4*/), cgh);
+                                     // copy the fields into locals so the kernel lambda captures plain values
+                                     auto res_data_ct1 = res.d_data;
+                                     auto src_data_ct1 = src.d_data;
+                                     auto src_width = src.width;
+                                     auto src_pitch = src.pitch;
+                                     auto src_height = src.height;
+                                     auto res_pitch = res.pitch;
+
+                                     cgh.parallel_for(
+                                         sycl::nd_range<3>(blocks * threads, threads),
+                                         [=](sycl::nd_item<3> item_ct1)[[intel::reqd_sub_group_size(32)]]
+                                         {                                           
+                                           ScaleDown(res_data_ct1, src_data_ct1, src_width, src_pitch, src_height,
+                                                     res_pitch, item_ct1, d_ScaleDownKernel_ptr_ct1,
+                                                     inrow_acc_ct1.get_pointer(), brow_acc_ct1.get_pointer(),
+                                                     yRead_acc_ct1.get_pointer(), yWrite_acc_ct1.get_pointer());
+                                         }); })
+      .wait();
+#ifdef DEVICE_TIMER
+  auto stop_kernel = std::chrono::steady_clock::now();
+  // printf("ScaleDown time =          %.2f us\n", std::chrono::duration<float, std::micro>(stop_kernel - start_kernel).count());
+  totTime += std::chrono::duration<float, std::micro>(stop_kernel - start_kernel).count();
+#endif
+#endif
+  checkMsg("ScaleDown() execution failed\n");
+  return 0.0;
+}
+
+double ScaleUp(CudaImage &res, CudaImage &src, sycl::queue &q_ct, float &totTime)
+{ // Host wrapper: runs the ScaleUp device kernel from src into res and waits for it; always returns 0.0.
+  if (res.d_data == NULL || src.d_data == NULL)
+  {
+    printf("ScaleUp: missing data\n");
+    return 0.0;
+  }
+  sycl::range<3> blocks(1, iDivUp(res.height, SCALEUP_H),
+                        iDivUp(res.width, SCALEUP_W));
+  sycl::range<3> threads(1, SCALEUP_H / 2, SCALEUP_W / 2); // work-group is half of SCALEUP_H x SCALEUP_W in each dimension
+
+#ifdef DEVICE_TIMER
+  auto start_kernel = std::chrono::steady_clock::now();
+#endif
+
+  q_ct.submit([&](sycl::handler &cgh)
+              {
+                                     auto src_data_ct1 = src.d_data;
+                                     auto res_data_ct1 = res.d_data;
+                                     auto src_width = src.width;
+                                     auto src_pitch = src.pitch;
+                                     auto src_height = src.height;
+                                     auto res_pitch = res.pitch;
+                                     cgh.parallel_for(
+                                         sycl::nd_range<3>(blocks * threads, threads),
+                                         [=](sycl::nd_item<3> item_ct1)[[intel::reqd_sub_group_size(32)]]
+                                         {                                           
+                                           ScaleUp(res_data_ct1, src_data_ct1, src_width, src_pitch, src_height,
+                                                   res_pitch, item_ct1);
+                                         }); })
+      .wait();
+
+#ifdef DEVICE_TIMER
+  auto stop_kernel = std::chrono::steady_clock::now();
+  // printf("ScaleUp time =          %.2f us\n", std::chrono::duration<float, std::micro>(stop_kernel - start_kernel).count());
+  totTime += std::chrono::duration<float, std::micro>(stop_kernel - start_kernel).count();
+#endif
+  checkMsg("ScaleUp() execution failed\n");
+  return 0.0;
+}
+
+double ComputeOrientations(CudaImage &src, SiftData &siftData, int octave, sycl::queue &q_ct, float &totTime)
+{ // Host wrapper: runs ComputeOrientationsCONSTNew on src for the points in siftData.d_data; always returns 0.0.
+  sycl::range<3> blocks(1, 1, 512); // fixed launch size: 512 work-groups of 256 work-items
+  sycl::range<3> threads(1, 1, 256);
+#ifdef DEVICE_TIMER
+  auto start_kernel = std::chrono::steady_clock::now();
+#endif
+  q_ct.submit([&](sycl::handler &cgh)
+              {
+
+                auto d_MaxNumPoints_ptr_ct1 = d_MaxNumPoints.get_ptr();                
+                auto d_PointCounter_ptr_ct1 = d_PointCounter.get_ptr();
+
+                sycl::accessor<float, 2, sycl::access_mode::read_write,
+                                sycl::access::target::local>
+                    img_acc_ct1(sycl::range<2>(19 /*WID*/, 19 /*WID*/), cgh);
+                sycl::accessor<float, 2, sycl::access_mode::read_write,
+                                sycl::access::target::local>
+                    tmp_acc_ct1(sycl::range<2>(19 /*WID*/, 19 /*WID*/), cgh);
+                sycl::accessor<float, 1, sycl::access_mode::read_write,
+                                sycl::access::target::local>
+                    hist_acc_ct1(sycl::range<1>(64 /*2*LEN*/), cgh);
+                sycl::accessor<float, 1, sycl::access_mode::read_write,
+                                sycl::access::target::local>
+                    gaussx_acc_ct1(sycl::range<1>(19 /*WID*/), cgh);
+                sycl::accessor<float, 1, sycl::access_mode::read_write,
+                                sycl::access::target::local>
+                    gaussy_acc_ct1(sycl::range<1>(19 /*WID*/), cgh);
+                // copy the fields into locals so the kernel lambda captures plain values
+                auto src_data_ct1 = src.d_data;
+                auto src_width = src.width;
+                auto src_pitch = src.pitch;
+                auto src_height = src.height;
+                auto siftData_data_ct1 = siftData.d_data;
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(blocks * threads, threads),
+                    [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                      ComputeOrientationsCONSTNew(
+                          src_data_ct1, src_width, src_pitch, src_height, siftData_data_ct1,
+                          octave, item_ct1, *d_MaxNumPoints_ptr_ct1, d_PointCounter_ptr_ct1,
+                          img_acc_ct1, tmp_acc_ct1, hist_acc_ct1.get_pointer(),
+                          gaussx_acc_ct1.get_pointer(), gaussy_acc_ct1.get_pointer());
+                    }); })
+      .wait();
+#ifdef DEVICE_TIMER
+  auto stop_kernel = std::chrono::steady_clock::now();
+  // printf("ComputeOrientationsCONSTNew time =          %.2f us\n", std::chrono::duration<float, std::micro>(stop_kernel - start_kernel).count());
+  totTime += std::chrono::duration<float, std::micro>(stop_kernel - start_kernel).count();
+#endif
+  checkMsg("ComputeOrientations() execution failed\n");
+  return 0.0;
+}
+
+double ExtractSiftDescriptors(float *texObj, int pitch, SiftData &siftData, float subsampling, int octave, sycl::queue &q_ct, float &totTime)
+{ // Host wrapper: runs ExtractSiftDescriptorsCONSTNew on image data texObj for the points in siftData.d_data; returns 0.0.
+  sycl::range<3> blocks(1, 1, 512); // fixed launch size: 512 work-groups of 8 x 16 work-items
+  sycl::range<3> threads(1, 8, 16);
+#ifdef DEVICE_TIMER
+  auto start_kernel = std::chrono::steady_clock::now();
+#endif
+  q_ct.submit([&](sycl::handler &cgh)
+              {
+                                     d_MaxNumPoints.init();
+                                     d_PointCounter.init();
+
+                                     auto d_MaxNumPoints_ptr_ct1 = d_MaxNumPoints.get_ptr();
+                                     auto d_PointCounter_ptr_ct1 = d_PointCounter.get_ptr();
+
+                                     sycl::accessor<float, 1, sycl::access_mode::read_write,
+                                                    sycl::access::target::local>
+                                         gauss_acc_ct1(sycl::range<1>(16), cgh);
+                                     sycl::accessor<float, 1, sycl::access_mode::read_write,
+                                                    sycl::access::target::local>
+                                         buffer_acc_ct1(sycl::range<1>(128), cgh);
+                                     sycl::accessor<float, 1, sycl::access_mode::read_write,
+                                                    sycl::access::target::local>
+                                         sums_acc_ct1(sycl::range<1>(4), cgh);
+
+                                     auto siftData_data_ct1 = siftData.d_data;
+
+                                     cgh.parallel_for(
+                                         sycl::nd_range<3>(blocks * threads, threads), [=
+                                     ](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(
+                                                                                           32)]] {
+                                           ExtractSiftDescriptorsCONSTNew(                                              
+                                               texObj, pitch,
+                                               siftData_data_ct1, subsampling, octave, item_ct1,
+                                               *d_MaxNumPoints_ptr_ct1, d_PointCounter_ptr_ct1,
+                                               gauss_acc_ct1.get_pointer(), buffer_acc_ct1.get_pointer(),
+                                               sums_acc_ct1.get_pointer());
+                                         }); })
+      .wait();
+
+#ifdef DEVICE_TIMER
+  auto stop_kernel = std::chrono::steady_clock::now();
+  // printf("ExtractSiftDescriptorsCONSTNew time =          %.2f us\n", std::chrono::duration<float, std::micro>(stop_kernel - start_kernel).count());
+  totTime += std::chrono::duration<float, std::micro>(stop_kernel - start_kernel).count();
+#endif
+  checkMsg("ExtractSiftDescriptors() execution failed\n");
+  return 0.0;
+}
+
+double RescalePositions(SiftData &siftData, float scale, sycl::queue &q_ct, float &totTime)
+{ // Host wrapper: runs the RescalePositions device kernel over the first numPts points with factor `scale`; returns 0.0.
+  if (siftData.numPts <= 0) // nothing to rescale; also avoids submitting a zero-sized nd_range
+    return 0.0;
+  sycl::range<3> blocks(1, 1, iDivUp(siftData.numPts, 64));
+  sycl::range<3> threads(1, 1, 64);
+#ifdef DEVICE_TIMER
+  auto start_kernel = std::chrono::steady_clock::now();
+#endif
+  q_ct.submit([&](sycl::handler &cgh)
+              {
+                                     // copy the fields into locals so the kernel lambda captures plain values
+                                     auto siftData_data_ct1 = siftData.d_data;
+                                     auto siftData_numPts = siftData.numPts;
+                                     cgh.parallel_for(
+                                         sycl::nd_range<3>(blocks * threads, threads),
+                                         [=](sycl::nd_item<3> item_ct1)[[intel::reqd_sub_group_size(32)]]
+                                         { RescalePositions(siftData_data_ct1, siftData_numPts, scale, item_ct1); }); })
+      .wait();
+#ifdef DEVICE_TIMER
+  auto stop_kernel = std::chrono::steady_clock::now();
+  totTime += std::chrono::duration<float, std::micro>(stop_kernel - start_kernel).count(); // host wall time, microseconds
+#endif
+  checkMsg("RescalePositions() execution failed\n");
+  return 0.0;
+}
+
+double LowPass(CudaImage &res, CudaImage &src, float scale, sycl::queue &q_ct, float &totTime)
+{ // Host wrapper: filters src into res with LowPassBlockOld using a normalized Gaussian of sigma `scale`; always returns 0.0.
+  try
+  {
+    float kernel[2 * LOWPASS_R + 1];
+    static float oldScale = -1.0f; // scale of the taps uploaded last; they are rebuilt only when it changes
+    if (scale != oldScale)
+    {
+      float kernelSum = 0.0f;
+      float ivar2 = 1.0f / (2.0f * scale * scale);
+      for (int j = -LOWPASS_R; j <= LOWPASS_R; j++)
+      {
+        kernel[j + LOWPASS_R] = (float)expf(-(double)j * j * ivar2);
+        kernelSum += kernel[j + LOWPASS_R];
+      }
+      for (int j = -LOWPASS_R; j <= LOWPASS_R; j++)
+        kernel[j + LOWPASS_R] /= kernelSum; // normalize so the taps sum to 1
+
+#ifdef DEVICE_TIMER
+      auto start_memcpy_1 = std::chrono::steady_clock::now();
+#endif
+      q_ct.memcpy(d_LowPassKernel.get_ptr(), kernel,
+                  (2 * LOWPASS_R + 1) * sizeof(float));
+      q_ct.wait();
+#ifdef DEVICE_TIMER
+      auto stop_memcpy_1 = std::chrono::steady_clock::now();
+      totTime += std::chrono::duration<float, std::micro>(stop_memcpy_1 - start_memcpy_1).count();
+#endif
+      oldScale = scale;
+    }
+    int width = res.width;
+    int pitch = res.pitch;
+    int height = res.height;
+    sycl::range<3> blocks(1, iDivUp(height, LOWPASS_H), iDivUp(width, LOWPASS_W)); //(1, 34, 80)
+    sycl::range<3> threads(1, 4, LOWPASS_W + 2 * LOWPASS_R);                       //(1, 4, 32)
+
+#ifdef DEVICE_TIMER
+    auto start_kernel = std::chrono::steady_clock::now();
+#endif
+    q_ct.submit([&](sycl::handler &cgh)
+                {
+                                     auto d_LowPassKernel_ptr_ct1 = d_LowPassKernel.get_ptr();
+                                     // copy the pointers into locals so the kernel lambda captures plain values
+                                     auto src_data_ct1 = src.d_data;
+                                     auto res_data_ct1 = res.d_data;
+
+                                     sycl::accessor<float, 2, sycl::access_mode::read_write,
+                                                    sycl::access::target::local>
+                                         xrows_acc_ct1(sycl::range<2>(16, 32), cgh);
+                                     cgh.parallel_for(
+                                         sycl::nd_range<3>(blocks * threads, threads), [=](sycl::nd_item<3> item_ct1)
+                                         [[intel::reqd_sub_group_size(32)]]
+                                         { LowPassBlockOld(src_data_ct1, res_data_ct1, width, pitch, height, item_ct1,
+                                                        d_LowPassKernel_ptr_ct1, xrows_acc_ct1); }); })
+        .wait();
+#ifdef DEVICE_TIMER
+    auto stop_kernel = std::chrono::steady_clock::now();
+    // host wall time of submit + wait, accumulated in microseconds
+    totTime += std::chrono::duration<float, std::micro>(stop_kernel - start_kernel).count();
+#endif
+    checkMsg("LowPass() execution failed\n");
+  }
+  catch (sycl::exception const &e)
+  {
+    std::cout << e.what() << '\n';
+  }
+  return 0.0; // placed after the try/catch so the exception path also returns a value
+}
+
+//==================== Multi-scale functions ===================//
+
+void PrepareLaplaceKernels(int numOctaves, float initBlur, float *kernel)
+{ // Fills kernel with one-sided Gaussian taps for NUM_SCALES + 3 scales, recursing first for octaves numOctaves-1 .. 1.
+  if (numOctaves > 1)
+  {
+    float totInitBlur = (float)sqrt(initBlur * initBlur + 0.5f * 0.5f) / 2.0f;
+    PrepareLaplaceKernels(numOctaves - 1, totInitBlur, kernel);
+  }
+  float scale = pow(2.0f, -1.0f / NUM_SCALES);
+  float diffScale = pow(2.0f, 1.0f / NUM_SCALES); // factor applied to scale after each of the loop iterations below
+  for (int i = 0; i < NUM_SCALES + 3; i++)
+  {
+    float kernelSum = 0.0f;
+    float var = scale * scale - initBlur * initBlur;
+    for (int j = 0; j <= LAPLACE_R; j++)
+    { // taps for this octave start at numOctaves * 12 * 16, with a stride of 16 floats per scale i
+      kernel[numOctaves * 12 * 16 + 16 * i + j] = (float)expf(-(double)j * j / 2.0 / var);
+      kernelSum += (j == 0 ? 1 : 2) * kernel[numOctaves * 12 * 16 + 16 * i + j]; // taps with j > 0 count twice
+    }
+    for (int j = 0; j <= LAPLACE_R; j++)
+      kernel[numOctaves * 12 * 16 + 16 * i + j] /= kernelSum;
+    scale *= diffScale;
+  }
+}
+
+double LaplaceMulti(CudaImage &baseImage, CudaImage *results, int octave, sycl::queue &q_ct, float &totTime)
+{ // Host wrapper: runs LaplaceMultiMem from baseImage into the buffer starting at results[0].d_data; always returns 0.0.
+  int width = results[0].width; // geometry is taken from results[0] only
+  int pitch = results[0].pitch;
+  int height = results[0].height;
+
+#if 1
+  sycl::range<3> threads(1, 1, LAPLACE_W + 2 * LAPLACE_R);    //(1, 1, 136)
+  sycl::range<3> blocks(1, height, iDivUp(width, LAPLACE_W)); //(1, 1080, 15)
+
+#ifdef DEVICE_TIMER
+  auto start_kernel = std::chrono::steady_clock::now();
+#endif
+
+  q_ct.submit([&](sycl::handler &cgh)
+              {
+        float *d_LaplaceKernel_ptr_ct1 = d_LaplaceKernel.get_ptr();
+        sycl::accessor<float, 1, sycl::access_mode::read_write,
+                       sycl::access::target::local>
+            buff_acc_ct1(
+                sycl::range<1>(1088 /*(LAPLACE_W + 2*LAPLACE_R)*LAPLACE_S*/), cgh);                       
+
+        float *results_d_data_ct1 = results[0].d_data;
+        float *baseImage_data_ct1 = baseImage.d_data;
+        cgh.parallel_for(
+            sycl::nd_range<3>(blocks * threads, threads),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+              LaplaceMultiMem(baseImage_data_ct1, results_d_data_ct1,
+                              width, pitch, height, octave, item_ct1,
+                              d_LaplaceKernel_ptr_ct1,
+                              buff_acc_ct1.get_pointer());
+            }); })
+      .wait();
+
+#ifdef DEVICE_TIMER
+  auto stop_kernel = std::chrono::steady_clock::now();
+  // printf("LaplaceMultiMem time =          %.2f us\n", std::chrono::duration<float, std::micro>(stop_kernel - start_kernel).count());
+  totTime += std::chrono::duration<float, std::micro>(stop_kernel - start_kernel).count();
+#endif
+#endif
+  checkMsg("LaplaceMulti() execution failed\n");
+  return 0.0;
+}
+
+double FindPointsMulti(CudaImage *sources, SiftData &siftData, float thresh, float edgeLimit, float factor,
+                       float lowestScale, float subsampling, int octave, sycl::queue &q_ct, float &totTime)
+{ // Host wrapper: runs FindPointsMultiNew on the buffer at sources->d_data, writing into siftData.d_data; returns 0.0.
+  if (sources->d_data == NULL)
+  {
+    printf("FindPointsMulti: missing data\n");
+    return 0.0;
+  }
+  int w = sources->width; // geometry is taken from the first element of sources only
+  int p = sources->pitch;
+  int h = sources->height;
+#if 1
+  sycl::range<3> blocks(1, iDivUp(h, MINMAX_H),
+                        iDivUp(w, MINMAX_W) * NUM_SCALES);
+  sycl::range<3> threads(1, 1, MINMAX_W + 2);
+
+#ifdef DEVICE_TIMER
+  auto start_kernel = std::chrono::steady_clock::now();
+#endif
+  auto event_FindPointsMulti = q_ct.submit([&](sycl::handler &cgh)
+                                           {
+                                     d_MaxNumPoints.init();
+                                     d_PointCounter.init();
+
+                                     auto d_MaxNumPoints_ptr_ct1 = d_MaxNumPoints.get_ptr();
+                                     auto d_PointCounter_ptr_ct1 = d_PointCounter.get_ptr();
+
+                                     sycl::accessor<unsigned short, 1, sycl::access_mode::read_write,
+                                                    sycl::access::target::local>
+                                         points_acc_ct1(sycl::range<1>(64 /*2*MEMWID*/), cgh);
+
+                                     auto sources_d_data_ct0 = sources->d_data;
+                                     auto siftData_data_ct1 = siftData.d_data;
+
+                                     cgh.parallel_for(
+                                         sycl::nd_range<3>(blocks * threads, threads), [=
+                                     ](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]]
+                                         {                                           
+                                           FindPointsMultiNew(sources_d_data_ct0, siftData_data_ct1, w, p, h,
+                                                              subsampling, lowestScale, thresh, factor,
+                                                              edgeLimit, octave, item_ct1,
+                                                              *d_MaxNumPoints_ptr_ct1, d_PointCounter_ptr_ct1,
+                                                              points_acc_ct1.get_pointer());
+                                         }); });
+  event_FindPointsMulti.wait(); // block until the kernel has finished
+#ifdef DEVICE_TIMER
+  auto stop_kernel = std::chrono::steady_clock::now();
+  // printf("FindPointsMultiNew time =          %.2f us\n", std::chrono::duration<float, std::micro>(stop_kernel - start_kernel).count())
+  totTime += std::chrono::duration<float, std::micro>(stop_kernel - start_kernel).count();
+#endif
+#endif
+  checkMsg("FindPointsMulti() execution failed\n");
+  return 0.0;
+}
diff --git a/src/cudaSiftH.h b/src/cudaSiftH.h
new file mode 100644
index 0000000..746c25a
--- /dev/null
+++ b/src/cudaSiftH.h
@@ -0,0 +1,52 @@
+//********************************************************//
+// CUDA SIFT extractor by Marten Bjorkman aka Celebrandil //
+//********************************************************//
+
+// Modifications Copyright (C) 2023 Intel Corporation
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom
+// the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
+// OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+// OR OTHER DEALINGS IN THE SOFTWARE.
+
+// SPDX-License-Identifier: MIT
+
+#ifndef CUDASIFTH_H
+#define CUDASIFTH_H
+
+#include <sycl/sycl.hpp>
+
+#include "infra/infra.hpp"
+#include "cudautils.h"
+#include "cudaImage.h"
+#include "cudaSift.h"
+
+int ExtractSiftLoop(SiftData &siftData, CudaImage &img, int numOctaves, double initBlur, float thresh,
+                    float lowestScale, float subsampling, float *memoryTmp, float *memorySub, sycl::queue &q_ct, float &totTime);
+void ExtractSiftOctave(SiftData &siftData, CudaImage &img, int octave, float thresh, float lowestScale, float subsampling,
+                       float *memoryTmp, sycl::queue &q_ct, float &totTime);
+double ScaleDown(CudaImage &res, CudaImage &src, float variance, sycl::queue &q_ct, float &totTime);
+double ScaleUp(CudaImage &res, CudaImage &src, sycl::queue &q_ct, float &totTime);
+double ComputeOrientations(CudaImage &src, SiftData &siftData, int octave, sycl::queue &q_ct, float &totTime);
+double ExtractSiftDescriptors(float *texObj, int pitch, SiftData &siftData, float subsampling,
+                              int octave, sycl::queue &q_ct, float &totTime);
+double RescalePositions(SiftData &siftData, float scale, sycl::queue &q_ct, float &totTime);
+double LowPass(CudaImage &res, CudaImage &src, float scale, sycl::queue &q_ct, float &totTime);
+void PrepareLaplaceKernels(int numOctaves, float initBlur, float *kernel);
+double LaplaceMulti(CudaImage &baseImage, CudaImage *results, int octave, sycl::queue &q_ct, float &totTime);
+double FindPointsMulti(CudaImage *sources, SiftData &siftData, float thresh, float edgeLimit, float factor, float lowestScale,
+                       float subsampling, int octave, sycl::queue &q_ct, float &totTime);
+#endif
diff --git a/src/cudautils.h b/src/cudautils.h
new file mode 100644
index 0000000..7e1ca31
--- /dev/null
+++ b/src/cudautils.h
@@ -0,0 +1,108 @@
+// Modifications Copyright (C) 2023 Intel Corporation
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom
+// the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
+// OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+// OR OTHER DEALINGS IN THE SOFTWARE.
+
+// SPDX-License-Identifier: MIT
+
+#ifndef CUDAUTILS_H
+#define CUDAUTILS_H
+
+#include <sycl/sycl.hpp>
+#include <cstdio>
+#include <iostream>
+#include <chrono>
+
+#ifdef WIN32
+#include <intrin.h>
+#endif
+
+#define safeCall(err) __safeCall(err, __FILE__, __LINE__)
+#define checkMsg(msg) __checkMsg(msg, __FILE__, __LINE__)
+
+inline void __safeCall(int err, const char *file, const int line)
+{ // Empty body: the status code and call-site location are accepted but never inspected.
+}
+
+inline void __checkMsg(const char *errorMessage, const char *file, const int line)
+{
+  // Nothing is checked here; the message and call-site location are accepted and ignored.
+}
+
+class TimerCPU // Elapsed-time helper driven by the CPU time-stamp counter (rdtsc).
+{
+  static const int bits = 10; // number of low counter bits dropped by getTSC()
+
+public:
+  long long beg_clock; // scaled counter value sampled in the constructor
+  float freq;          // clock frequency in MHz, as passed to the constructor
+  TimerCPU(float freq_) : freq(freq_)
+  { // freq = clock frequency in MHz
+    beg_clock = getTSC(bits);
+  }
+  long long getTSC(int bits) // returns the 64-bit counter shifted right by `bits`
+  {
+#ifdef WIN32
+    return __rdtsc() / (1LL << bits);
+#else
+    unsigned int low, high;
+    // volatile keeps the compiler from merging or hoisting repeated counter reads.
+    __asm__ __volatile__(".byte 0x0f, 0x31" : "=a"(low), "=d"(high));
+    return ((long long)high << (32 - bits)) | ((long long)low >> bits);
+#endif
+  }
+  float read() // elapsed time since construction: cycles / MHz / 1e3
+  {
+    long long end_clock = getTSC(bits);
+    long long Kcycles = end_clock - beg_clock; // in units of 2^bits cycles
+    float time = (float)(1 << bits) * Kcycles / freq / 1e3f;
+    return time;
+  }
+};
+
+template <class T>
+__inline__ T ShiftDown(T var, unsigned int delta, sycl::nd_item<3> item_ct1, int width = 32)
+{ // Sub-group shift: forwards to sycl::shift_group_left on item_ct1's sub-group.
+#if (SYCL_LANGUAGE_VERSION >= 9000)
+  return sycl::shift_group_left(item_ct1.get_sub_group(), var, delta);
+#else
+  return __shfl_down(var, delta, width); // `width` is only consumed by this fallback
+#endif
+}
+
+template <class T>
+__inline__ T ShiftUp(T var, unsigned int delta, sycl::nd_item<3> item_ct1, int width = 32)
+{ // Sub-group shift: forwards to sycl::shift_group_right on item_ct1's sub-group.
+#if (SYCL_LANGUAGE_VERSION >= 9000)
+  return sycl::shift_group_right(item_ct1.get_sub_group(), var, delta);
+#else
+  return __shfl_up(var, delta, width); // `width` is only consumed by this fallback
+#endif
+}
+
+template <class T>
+__inline__ T Shuffle(T var, unsigned int lane, sycl::nd_item<3> item_ct1, int width = 32)
+{ // Sub-group broadcast: forwards to sycl::select_from_group with `lane` as the source id.
+#if (SYCL_LANGUAGE_VERSION >= 9000)
+  return sycl::select_from_group(item_ct1.get_sub_group(), var, lane);
+#else
+  return __shfl(var, lane, width); // `width` is only consumed by this fallback
+#endif
+}
+
+#endif
diff --git a/geomFuncs.cpp b/src/geomFuncs.cpp
similarity index 100%
rename from geomFuncs.cpp
rename to src/geomFuncs.cpp
diff --git a/src/infra/atomic.hpp b/src/infra/atomic.hpp
new file mode 100644
index 0000000..80c78d2
--- /dev/null
+++ b/src/infra/atomic.hpp
@@ -0,0 +1,317 @@
+//==---- atomic.hpp -------------------------------*- C++ -*----------------==//
+// Copyright (C) 2023 Intel Corporation
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom
+// the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
+// OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+// OR OTHER DEALINGS IN THE SOFTWARE.
+
+// SPDX-License-Identifier: MIT
+//===----------------------------------------------------------------------===//
+
+#ifndef __INFRA_ATOMIC_HPP__
+#define __INFRA_ATOMIC_HPP__
+
+#include <sycl/sycl.hpp>
+
+namespace infra
+{
+
+  /// Atomically add the value operand to the value at the addr and assign the
+  /// result to the value at addr, Int version.
+  /// \param [in, out] addr The pointer to the data.
+  /// \param operand The value to add to the value at \p addr.
+  /// \param memoryOrder The memory ordering used.
+  /// \returns The value at the \p addr before the call.
+  template <typename T, sycl::access::address_space addressSpace =
+                            sycl::access::address_space::global_space>
+  inline T atomic_fetch_add(
+      T *addr, T operand,
+      sycl::memory_order memoryOrder = sycl::memory_order::relaxed)
+  { // Generic overload: delegates to sycl::atomic_fetch_add on the wrapped address.
+    sycl::multi_ptr<T, addressSpace> location(addr);
+    sycl::atomic<T, addressSpace> atom(location);
+    return sycl::atomic_fetch_add(atom, operand, memoryOrder);
+  }
+
+  /// Atomically add the value operand to the value at the addr and assign the
+  /// result to the value at addr, Float version.
+  /// \param [in, out] addr The pointer to the data.
+  /// \param operand The value to add to the value at \p addr.
+  /// \param memoryOrder The memory ordering used.
+  /// \returns The value at the \p addr before the call.
+  template <sycl::access::address_space addressSpace =
+                sycl::access::address_space::global_space>
+  inline float atomic_fetch_add(
+      float *addr, float operand,
+      sycl::memory_order memoryOrder = sycl::memory_order::relaxed)
+  {
+    static_assert(sizeof(float) == sizeof(int), "Mismatched type size");
+
+    // The float is updated through an int atomic of the same size.
+    sycl::multi_ptr<int, addressSpace> raw(reinterpret_cast<int *>(addr));
+    sycl::atomic<int, addressSpace> cell(raw);
+
+    int seen, wanted;
+    float before;
+
+    // Retry from a fresh snapshot until the compare-exchange succeeds.
+    do
+    {
+      seen = cell.load(memoryOrder);
+      before = *reinterpret_cast<const float *>(&seen);
+      const float after = before + operand;
+      wanted = *reinterpret_cast<const int *>(&after);
+    } while (!cell.compare_exchange_strong(seen, wanted, memoryOrder));
+
+    return before;
+  }
+
+  /// Atomically add the value operand to the value at the addr and assign the
+  /// result to the value at addr, Double version.
+  /// \param [in, out] addr The pointer to the data.
+  /// \param operand The value to add to the value at \p addr
+  /// \param memoryOrder The memory ordering used.
+  /// \returns The value at the \p addr before the call.
+  template <sycl::access::address_space addressSpace =
+                sycl::access::address_space::global_space>
+  inline double atomic_fetch_add(
+      double *addr, double operand,
+      sycl::memory_order memoryOrder = sycl::memory_order::relaxed)
+  {
+    using word_t = unsigned long long int;
+    static_assert(sizeof(double) == sizeof(unsigned long long int),
+                  "Mismatched type size");
+
+    // The double is updated through a 64-bit integer atomic of the same size.
+    sycl::multi_ptr<word_t, addressSpace> raw(reinterpret_cast<word_t *>(addr));
+    sycl::atomic<word_t, addressSpace> cell(raw);
+
+    word_t seen;
+    word_t wanted;
+    double before;
+
+    // Retry from a fresh snapshot until the compare-exchange installs the sum
+    // computed from that snapshot.
+    do
+    {
+      seen = cell.load(memoryOrder);
+      before = *reinterpret_cast<const double *>(&seen);
+      const double after = before + operand;
+      wanted = *reinterpret_cast<const word_t *>(&after);
+    } while (!cell.compare_exchange_strong(seen, wanted, memoryOrder));
+
+    return before;
+  }
+
+  /// Atomically subtract the value operand from the value at the addr and assign
+  /// the result to the value at addr.
+  /// \param [in, out] addr The pointer to the data.
+  /// \param operand The value to subtract from the value at \p addr
+  /// \param memoryOrder The memory ordering used.
+  /// \returns The value at the \p addr before the call.
+  template <typename T, sycl::access::address_space addressSpace =
+                            sycl::access::address_space::global_space>
+  inline T atomic_fetch_sub(
+      T *addr, T operand,
+      sycl::memory_order memoryOrder = sycl::memory_order::relaxed)
+  { // Delegates to sycl::atomic_fetch_sub on the wrapped address.
+    sycl::multi_ptr<T, addressSpace> location(addr);
+    sycl::atomic<T, addressSpace> atom(location);
+    return sycl::atomic_fetch_sub(atom, operand, memoryOrder);
+  }
+
+  /// Atomically perform a bitwise AND between the value operand and the value at the addr
+  /// and assign the result to the value at addr.
+  /// \param [in, out] addr The pointer to the data.
+  /// \param operand The value to use in bitwise AND operation with the value at the \p addr.
+  /// \param memoryOrder The memory ordering used.
+  /// \returns The value at the \p addr before the call.
+  template <typename T, sycl::access::address_space addressSpace =
+                            sycl::access::address_space::global_space>
+  inline T atomic_fetch_and(
+      T *addr, T operand,
+      sycl::memory_order memoryOrder = sycl::memory_order::relaxed)
+  { // Delegates to sycl::atomic_fetch_and on the wrapped address.
+    sycl::multi_ptr<T, addressSpace> location(addr);
+    sycl::atomic<T, addressSpace> atom(location);
+    return sycl::atomic_fetch_and(atom, operand, memoryOrder);
+  }
+
+  /// Atomically or the value at the addr with the value operand, and assign
+  /// the result to the value at addr.
+  /// \param [in, out] addr The pointer to the data.
+  /// \param operand The value to use in bitwise OR operation with the value at the \p addr.
+  /// \param memoryOrder The memory ordering used.
+  /// \returns The value at the \p addr before the call.
+  template <typename T, sycl::access::address_space addressSpace =
+                            sycl::access::address_space::global_space>
+  inline T atomic_fetch_or(
+      T *addr, T operand,
+      sycl::memory_order memoryOrder = sycl::memory_order::relaxed)
+  { // Delegates to sycl::atomic_fetch_or on the wrapped address.
+    sycl::multi_ptr<T, addressSpace> location(addr);
+    sycl::atomic<T, addressSpace> atom(location);
+    return sycl::atomic_fetch_or(atom, operand, memoryOrder);
+  }
+
+  /// Atomically xor the value at the addr with the value operand, and assign
+  /// the result to the value at addr.
+  /// \param [in, out] addr The pointer to the data.
+  /// \param operand The value to use in bitwise XOR operation with the value at the \p addr.
+  /// \param memoryOrder The memory ordering used.
+  /// \returns The value at the \p addr before the call.
+  template <typename T, sycl::access::address_space addressSpace =
+                            sycl::access::address_space::global_space>
+  inline T atomic_fetch_xor(
+      T *addr, T operand,
+      sycl::memory_order memoryOrder = sycl::memory_order::relaxed)
+  { // Delegates to sycl::atomic_fetch_xor on the wrapped address.
+    sycl::multi_ptr<T, addressSpace> location(addr);
+    sycl::atomic<T, addressSpace> atom(location);
+    return sycl::atomic_fetch_xor(atom, operand, memoryOrder);
+  }
+
+  /// Atomically calculate the minimum of the value at addr and the value operand
+  /// and assign the result to the value at addr.
+  /// \param [in, out] addr The pointer to the data.
+  /// \param operand The value to compare with the value at \p addr.
+  /// \param memoryOrder The memory ordering used.
+  /// \returns The value at the \p addr before the call.
+  template <typename T, sycl::access::address_space addressSpace =
+                            sycl::access::address_space::global_space>
+  inline T atomic_fetch_min(
+      T *addr, T operand,
+      sycl::memory_order memoryOrder = sycl::memory_order::relaxed)
+  { // Delegates to sycl::atomic_fetch_min on the wrapped address.
+    sycl::multi_ptr<T, addressSpace> location(addr);
+    sycl::atomic<T, addressSpace> atom(location);
+    return sycl::atomic_fetch_min(atom, operand, memoryOrder);
+  }
+
+  /// Atomically calculate the maximum of the value at addr and the value operand
+  /// and assign the result to the value at addr.
+  /// \param [in, out] addr The pointer to the data.
+  /// \param operand The value to compare with the value at \p addr.
+  /// \param memoryOrder The memory ordering used.
+  /// \returns The value at the \p addr before the call.
+  template <typename T, sycl::access::address_space addressSpace =
+                            sycl::access::address_space::global_space>
+  inline T atomic_fetch_max(
+      T *addr, T operand,
+      sycl::memory_order memoryOrder = sycl::memory_order::relaxed)
+  { // Delegates to sycl::atomic_fetch_max on the wrapped address.
+    sycl::multi_ptr<T, addressSpace> location(addr);
+    sycl::atomic<T, addressSpace> atom(location);
+    return sycl::atomic_fetch_max(atom, operand, memoryOrder);
+  }
+
+  /// Atomically increment the value stored in \p addr if old value stored in \p
+  /// addr is less than \p operand, else set 0 to the value stored in \p addr.
+  /// Either update is installed with a compare-exchange against the value that
+  /// was tested, so concurrent callers cannot push the stored value past
+  /// \p operand between the test and the update.
+  /// \param [in, out] addr The pointer to the data.
+  /// \param operand The threshold value.
+  /// \param memoryOrder The memory ordering used.
+  /// \returns The old value stored in \p addr.
+  template <sycl::access::address_space addressSpace =
+                sycl::access::address_space::global_space>
+  inline unsigned int atomic_fetch_compare_inc(
+      unsigned int *addr, unsigned int operand,
+      sycl::memory_order memoryOrder = sycl::memory_order::relaxed)
+  {
+    sycl::atomic<unsigned int, addressSpace> obj(
+        (sycl::multi_ptr<unsigned int, addressSpace>(addr)));
+    unsigned int old;
+
+    while (true)
+    {
+      old = obj.load(memoryOrder);
+      unsigned int next;
+      if (old >= operand)
+        next = 0; // threshold reached: wrap around
+      else
+        next = old + 1;
+      // A failed exchange means the value changed after the load above;
+      // loop again and redo the test on a fresh snapshot.
+      if (obj.compare_exchange_strong(old, next, memoryOrder, memoryOrder))
+        break;
+    }
+    return old;
+  }
+
+  /// Atomically exchange the value at the address addr with the value operand.
+  /// \param [in, out] addr The pointer to the data.
+  /// \param operand The value to be exchanged with the value pointed by \p addr.
+  /// \param memoryOrder The memory ordering used.
+  /// \returns The value at the \p addr before the call.
+  template <typename T, sycl::access::address_space addressSpace =
+                            sycl::access::address_space::global_space>
+  inline T atomic_exchange(
+      T *addr, T operand,
+      sycl::memory_order memoryOrder = sycl::memory_order::relaxed)
+  { // Delegates to sycl::atomic_exchange on the wrapped address.
+    sycl::multi_ptr<T, addressSpace> location(addr);
+    sycl::atomic<T, addressSpace> atom(location);
+    return sycl::atomic_exchange(atom, operand, memoryOrder);
+  }
+
+  /// Atomically compare the value at \p addr to the value expected and exchange
+  /// with the value desired if the value at \p addr is equal to the value expected.
+  /// Returns the value at the \p addr before the call.
+  /// \param [in, out] addr The multi_ptr to the data, in address space \p addressSpace.
+  /// \param expected The value to compare against the value at \p addr.
+  /// \param desired The value to assign to \p addr if the value at \p addr is expected.
+  /// \param success The memory ordering used when comparison succeeds.
+  /// \param fail The memory ordering used when comparison fails.
+  /// \returns The value at the \p addr before the call.
+  template <typename T, sycl::access::address_space addressSpace =
+                            sycl::access::address_space::global_space>
+  T atomic_compare_exchange_strong(
+      sycl::multi_ptr<T, addressSpace> addr, // address space follows the template argument
+      T expected, T desired,
+      sycl::memory_order success = sycl::memory_order::relaxed,
+      sycl::memory_order fail = sycl::memory_order::relaxed)
+  {
+    sycl::atomic<T, addressSpace> obj(addr);
+    obj.compare_exchange_strong(expected, desired, success, fail); // on failure `expected` receives the current value
+    return expected;
+  }
+
+  /// Atomically compare the value at \p addr to the value expected and exchange
+  /// with the value desired if the value at \p addr is equal to the value expected.
+  /// Returns the value at the \p addr before the call.
+  /// \param [in] addr The pointer to the data.
+  /// \param expected The value to compare against the value at \p addr.
+  /// \param desired The value to assign to \p addr if the value at \p addr is expected.
+  /// \param success The memory ordering used when comparison succeeds.
+  /// \param fail The memory ordering used when comparison fails.
+  /// \returns The value at the \p addr before the call.
+  template <typename T, sycl::access::address_space addressSpace =
+                            sycl::access::address_space::global_space>
+  T atomic_compare_exchange_strong(
+      T *addr, T expected, T desired,
+      sycl::memory_order success = sycl::memory_order::relaxed,
+      sycl::memory_order fail = sycl::memory_order::relaxed)
+  {
+    // Wrap the raw pointer and defer to the multi_ptr overload.
+    sycl::multi_ptr<T, addressSpace> wrapped(addr);
+    return atomic_compare_exchange_strong(wrapped, expected, desired, success, fail);
+  }
+
+}
+#endif
diff --git a/src/infra/device.hpp b/src/infra/device.hpp
new file mode 100644
index 0000000..1c91892
--- /dev/null
+++ b/src/infra/device.hpp
@@ -0,0 +1,533 @@
+//==---- device.hpp -------------------------------*- C++ -*----------------==//
+// Copyright (C) 2023 Intel Corporation
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom
+// the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
+// OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+// OR OTHER DEALINGS IN THE SOFTWARE.
+
+// SPDX-License-Identifier: MIT
+//===----------------------------------------------------------------------===//
+
+#ifndef __INFRA_DEVICE_HPP__
+#define __INFRA_DEVICE_HPP__
+
+#include <sycl/sycl.hpp>
+#include <algorithm>
+#include <cstring>
+#include <iostream>
+#include <mutex>
+#include <set>
+#include <sstream>
+#include <map>
+#include <vector>
+#include <thread>
+#if defined(__linux__)
+#include <unistd.h>
+#include <sys/syscall.h>
+#endif
+#if defined(_WIN64)
+#define NOMINMAX
+#include <windows.h>
+#endif
+
+namespace infra
+{
+
+  /// DPC++ default exception handler; prints each asynchronous SYCL exception to std::cerr.
+  inline auto exception_handler = [](sycl::exception_list exceptions)
+  { // `inline` gives the header-defined handler a single definition across translation units.
+    for (std::exception_ptr const &pending : exceptions)
+    {
+      try
+      {
+        std::rethrow_exception(pending);
+      }
+      catch (sycl::exception const &e) // other exception types are not caught here
+      {
+        std::cerr << "Caught asynchronous SYCL exception:" << std::endl
+                  << e.what() << std::endl
+                  << "Exception caught at file:" << __FILE__
+                  << ", line:" << __LINE__ << std::endl;
+      }
+    }
+  };
+
+  class device_info // Plain value holder for the device properties filled in by device_ext.
+  {
+  public:
+    // get interface
+    char *get_name() { return _name; }
+    sycl::id<3> get_max_work_item_sizes() { return _max_work_item_sizes; }
+    bool get_host_unified_memory() { return _host_unified_memory; }
+    int get_major_version() { return _major; }
+    int get_minor_version() { return _minor; }
+    int get_integrated() { return _integrated; }
+    int get_max_clock_frequency() { return _frequency; }
+    int get_max_compute_units() { return _max_compute_units; }
+    int get_max_work_group_size() { return _max_work_group_size; }
+    int get_max_sub_group_size() { return _max_sub_group_size; }
+    int get_max_work_items_per_compute_unit()
+    {
+      return _max_work_items_per_compute_unit;
+    }
+    size_t *get_max_nd_range_size() { return _max_nd_range_size; }
+    size_t get_global_mem_size() { return _global_mem_size; }
+    size_t get_local_mem_size() { return _local_mem_size; }
+    // set interface; set_name truncates to 255 characters and always NUL-terminates
+    void set_name(const char *name) { std::strncpy(_name, name, sizeof(_name) - 1); _name[sizeof(_name) - 1] = '\0'; }
+    void set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes)
+    {
+      _max_work_item_sizes = max_work_item_sizes;
+    }
+    void set_host_unified_memory(bool host_unified_memory)
+    {
+      _host_unified_memory = host_unified_memory;
+    }
+    void set_major_version(int major) { _major = major; }
+    void set_minor_version(int minor) { _minor = minor; }
+    void set_integrated(int integrated) { _integrated = integrated; }
+    void set_max_clock_frequency(int frequency) { _frequency = frequency; }
+    void set_max_compute_units(int max_compute_units)
+    {
+      _max_compute_units = max_compute_units;
+    }
+    void set_global_mem_size(size_t global_mem_size)
+    {
+      _global_mem_size = global_mem_size;
+    }
+    void set_local_mem_size(size_t local_mem_size)
+    {
+      _local_mem_size = local_mem_size;
+    }
+    void set_max_work_group_size(int max_work_group_size)
+    {
+      _max_work_group_size = max_work_group_size;
+    }
+    void set_max_sub_group_size(int max_sub_group_size)
+    {
+      _max_sub_group_size = max_sub_group_size;
+    }
+    void
+    set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit)
+    {
+      _max_work_items_per_compute_unit = max_work_items_per_compute_unit;
+    }
+    void set_max_nd_range_size(int max_nd_range_size[])
+    { // copies exactly three entries from the caller's array
+      for (int i = 0; i < 3; i++)
+        _max_nd_range_size[i] = max_nd_range_size[i];
+    }
+
+  private: // every field has an initialiser so getters are defined before any setter runs
+    char _name[256] = {};
+    sycl::id<3> _max_work_item_sizes;
+    bool _host_unified_memory = false;
+    int _major = 0;
+    int _minor = 0;
+    int _integrated = 0;
+    int _frequency = 0;
+    int _max_compute_units = 0;
+    int _max_work_group_size = 0;
+    int _max_sub_group_size = 0;
+    int _max_work_items_per_compute_unit = 0;
+    size_t _global_mem_size = 0;
+    size_t _local_mem_size = 0;
+    size_t _max_nd_range_size[3] = {};
+  };
+
+  class device_ext : public sycl::device
+  {
+  public:
+    device_ext() : sycl::device(), _ctx(*this) {}
+    ~device_ext()
+    {
+      std::lock_guard<std::mutex> lock(m_mutex);
+      for (auto &task : _tasks)
+      {
+        if (task.joinable())
+          task.join();
+      }
+      _tasks.clear();
+      _queues.clear();
+    }
+    device_ext(const sycl::device &base)
+        : sycl::device(base), _ctx(*this)
+    {
+#ifdef INFRA_USM_LEVEL_NONE
+      _queues.push_back(
+          std::make_shared<sycl::queue>(_ctx, base, exception_handler));
+#else
+      _queues.push_back(std::make_shared<sycl::queue>(
+          _ctx, base, exception_handler, sycl::property::queue::in_order()));
+#endif
+      _saved_queue = _default_queue = _queues[0].get();
+    }
+
+    int is_native_atomic_supported() { return 0; }
+    int get_major_version()
+    {
+      int major, minor;
+      get_version(major, minor);
+      return major;
+    }
+
+    int get_minor_version()
+    {
+      int major, minor;
+      get_version(major, minor);
+      return minor;
+    }
+
+    int get_max_compute_units()
+    {
+      return get_device_info().get_max_compute_units();
+    }
+
+    int get_max_clock_frequency()
+    {
+      return get_device_info().get_max_clock_frequency();
+    }
+
+    int get_integrated() { return get_device_info().get_integrated(); }
+
+    void get_device_info(device_info &out)
+    {
+      device_info prop;
+      prop.set_name(get_info<sycl::info::device::name>().c_str());
+
+      int major, minor;
+      get_version(major, minor);
+      prop.set_major_version(major);
+      prop.set_minor_version(minor);
+
+      prop.set_max_work_item_sizes(
+          get_info<sycl::info::device::max_work_item_sizes<3>>());
+      prop.set_host_unified_memory(
+          get_info<sycl::info::device::host_unified_memory>());
+
+      // max_clock_frequency parameter is not supported on host device
+      if (is_host())
+      {
+        // This code may need to be updated. Currently max_clock_frequency for
+        // host device is initialized with 1, in assumption that if other devices
+        // exist and they are being selected based on this parameter, other
+        // devices would have higher priority.
+        prop.set_max_clock_frequency(1);
+      }
+      else
+      {
+        prop.set_max_clock_frequency(
+            get_info<sycl::info::device::max_clock_frequency>());
+      }
+
+      prop.set_max_compute_units(
+          get_info<sycl::info::device::max_compute_units>());
+      prop.set_max_work_group_size(
+          get_info<sycl::info::device::max_work_group_size>());
+      prop.set_global_mem_size(
+          get_info<sycl::info::device::global_mem_size>());
+      prop.set_local_mem_size(get_info<sycl::info::device::local_mem_size>());
+
+      size_t max_sub_group_size = 1;
+      std::vector<size_t> sub_group_sizes =
+          get_info<sycl::info::device::sub_group_sizes>();
+
+      for (const auto &sub_group_size : sub_group_sizes)
+      {
+        if (max_sub_group_size < sub_group_size)
+          max_sub_group_size = sub_group_size;
+      }
+
+      prop.set_max_sub_group_size(max_sub_group_size);
+
+      prop.set_max_work_items_per_compute_unit(
+          get_info<sycl::info::device::max_work_group_size>());
+      int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
+      prop.set_max_nd_range_size(max_nd_range_size);
+
+      out = prop;
+    }
+
+    device_info get_device_info()
+    {
+      device_info prop;
+      get_device_info(prop);
+      return prop;
+    }
+
+    void reset()
+    {
+      std::lock_guard<std::mutex> lock(m_mutex);
+      // The queues are shared_ptrs and the ref counts of the shared_ptrs increase
+      // only in wait_and_throw(). If there is no other thread calling
+      // wait_and_throw(), the queues will be destructed. The destructor waits for
+      // all commands executing on the queue to complete. It isn't possible to
+      // destroy a queue immediately. This is a synchronization point in SYCL.
+      _queues.clear();
+      // create new default queue.
+#ifdef INFRA_USM_LEVEL_NONE
+      _queues.push_back(
+          std::make_shared<sycl::queue>(_ctx, *this, exception_handler));
+#else
+      _queues.push_back(std::make_shared<sycl::queue>(
+          _ctx, *this, exception_handler, sycl::property::queue::in_order()));
+#endif
+      _saved_queue = _default_queue = _queues.front().get();
+    }
+
+    sycl::queue &default_queue() { return *_default_queue; }
+
+    void queues_wait_and_throw()
+    {
+      std::unique_lock<std::mutex> lock(m_mutex);
+      std::vector<std::shared_ptr<sycl::queue>> current_queues(
+          _queues);
+      lock.unlock();
+      for (const auto &q : current_queues)
+      {
+        q->wait_and_throw();
+      }
+      // Guard the destruct of current_queues to make sure the ref count is safe.
+      lock.lock();
+    }
+    sycl::queue *create_queue(bool enable_exception_handler = false)
+    {
+      std::lock_guard<std::mutex> lock(m_mutex);
+      sycl::async_handler eh = {};
+      if (enable_exception_handler)
+      {
+        eh = exception_handler;
+      }
+#ifdef INFRA_USM_LEVEL_NONE
+      _queues.push_back(std::make_shared<sycl::queue>(
+          _ctx, *this, eh));
+#else
+      _queues.push_back(std::make_shared<sycl::queue>(
+          _ctx, *this, eh,
+          sycl::property::queue::in_order()));
+#endif
+      return _queues.back().get();
+    }
+    void destroy_queue(sycl::queue *&queue)
+    {
+      // Drops this device's ownership of *queue and nulls the caller's pointer.
+      std::lock_guard<std::mutex> lock(m_mutex);
+      _queues.erase(std::remove_if(_queues.begin(), _queues.end(),
+                                   [queue](const std::shared_ptr<sycl::queue> &q)
+                                   { return q.get() == queue; }),
+                    _queues.end());
+      if (_saved_queue == queue) _saved_queue = _default_queue; // avoid dangling
+      queue = nullptr;
+    }
+    void set_saved_queue(sycl::queue *q)
+    {
+      std::lock_guard<std::mutex> lock(m_mutex);
+      _saved_queue = q;
+    }
+    sycl::queue *get_saved_queue()
+    {
+      std::lock_guard<std::mutex> lock(m_mutex);
+      return _saved_queue;
+    }
+    sycl::context get_context() { return _ctx; }
+
+  private:
+    /// Parse the device version string into \p major and \p minor.
+    /// \param [out] major Major version number.
+    /// \param [out] minor Minor version number; 0 when the string has no '.'.
+    /// std::stoi throws std::invalid_argument if no digits are found where a
+    /// number is expected.
+    void get_version(int &major, int &minor)
+    {
+      // Version string has the following format:
+      // a. OpenCL<space><major.minor><space><vendor-specific-information>
+      // b. <major.minor>
+      const std::string ver = get_info<sycl::info::device::version>();
+      // Skip any non-numeric prefix such as "OpenCL ".
+      std::string::size_type pos = ver.find_first_of("0123456789");
+      if (pos == std::string::npos)
+        pos = ver.size(); // empty tail makes std::stoi throw, as before
+      major = std::stoi(ver.substr(pos));
+      // The minor number follows the first '.' at or after the major number.
+      // Without a '.', do not index past the end of the string.
+      pos = ver.find('.', pos);
+      if (pos == std::string::npos)
+        minor = 0;
+      else
+        minor = std::stoi(ver.substr(pos + 1));
+    }
+    void add_task(std::thread &&task)
+    {
+      std::lock_guard<std::mutex> lock(m_mutex);
+      _tasks.push_back(std::move(task));
+    }
+    friend void async_infra_free(std::vector<void *>,
+                                 std::vector<sycl::event>,
+                                 sycl::queue &);
+    sycl::queue *_default_queue;
+    sycl::queue *_saved_queue;
+    sycl::context _ctx;
+    std::vector<std::shared_ptr<sycl::queue>> _queues;
+    mutable std::mutex m_mutex;
+    std::vector<std::thread> _tasks;
+  };
+
+  static inline unsigned int get_tid() // OS-level id of the calling thread
+  {
+#if !defined(__linux__) && !defined(_WIN64)
+#error "Only support Windows and Linux."
+#elif defined(__linux__)
+    return syscall(SYS_gettid);
+#else // _WIN64
+    return GetCurrentThreadId();
+#endif
+  }
+
+  /// device manager
+  class dev_mgr
+  {
+  public:
+    device_ext &current_device()
+    {
+      unsigned int dev_id = current_device_id();
+      check_id(dev_id);
+      return *_devs[dev_id];
+    }
+    device_ext &cpu_device() const
+    {
+      std::lock_guard<std::mutex> lock(m_mutex);
+      if (_cpu_device == -1)
+      {
+        throw std::runtime_error("no valid cpu device");
+      }
+      else
+      {
+        return *_devs[_cpu_device];
+      }
+    }
+    device_ext &get_device(unsigned int id) const
+    {
+      std::lock_guard<std::mutex> lock(m_mutex);
+      check_id(id);
+      return *_devs[id];
+    }
+    unsigned int current_device_id() const
+    {
+      std::lock_guard<std::mutex> lock(m_mutex);
+      auto it = _thread2dev_map.find(get_tid());
+      if (it != _thread2dev_map.end())
+        return it->second;
+      return DEFAULT_DEVICE_ID;
+    }
+    void select_device(unsigned int id)
+    {
+      std::lock_guard<std::mutex> lock(m_mutex);
+      check_id(id);
+      _thread2dev_map[get_tid()] = id;
+    }
+    unsigned int device_count() { return _devs.size(); }
+
+    /// Returns the instance of device manager singleton.
+    static dev_mgr &instance()
+    {
+      static dev_mgr d_m;
+      return d_m;
+    }
+    dev_mgr(const dev_mgr &) = delete;
+    dev_mgr &operator=(const dev_mgr &) = delete;
+    dev_mgr(dev_mgr &&) = delete;
+    dev_mgr &operator=(dev_mgr &&) = delete;
+
+  private:
+    mutable std::mutex m_mutex;
+    /// Enumerate all SYCL devices. The default-selected device gets id 0, the
+    /// remaining devices follow in enumeration order, and the first CPU device
+    /// seen (if any) is remembered in _cpu_device.
+    dev_mgr()
+    {
+      // default_selector_v replaces the deprecated sycl::default_selector class.
+      sycl::device default_device = sycl::device(sycl::default_selector_v);
+      _devs.push_back(std::make_shared<device_ext>(default_device));
+      if (default_device.is_cpu())
+        _cpu_device = 0;
+
+      std::vector<sycl::device> sycl_all_devs =
+          sycl::device::get_devices(sycl::info::device_type::all);
+      // Collect other devices except for the default device.
+      for (auto &dev : sycl_all_devs)
+      {
+        if (dev == default_device)
+          continue;
+        _devs.push_back(std::make_shared<device_ext>(dev));
+        if (_cpu_device == -1 && dev.is_cpu())
+        {
+          _cpu_device = static_cast<int>(_devs.size() - 1);
+        }
+      }
+    }
+    void check_id(unsigned int id) const
+    {
+      if (id >= _devs.size())
+      {
+        throw std::runtime_error("invalid device id");
+      }
+    }
+    std::vector<std::shared_ptr<device_ext>> _devs;
+    /// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current
+    /// thread id in _thread2dev_map, which means default device should be used
+    /// for the current thread.
+    const unsigned int DEFAULT_DEVICE_ID = 0;
+    /// thread-id to device-id map.
+    std::map<unsigned int, unsigned int> _thread2dev_map;
+    int _cpu_device = -1;
+  };
+
+  /// Util function to get the default queue of current device in
+  /// device manager.
+  static inline sycl::queue &get_default_queue()
+  {
+    return dev_mgr::instance().current_device().default_queue();
+  }
+
+  /// Util function to get the current device.
+  static inline device_ext &get_current_device()
+  {
+    return dev_mgr::instance().current_device();
+  }
+
+  /// Util function to get a device by id.
+  static inline device_ext &get_device(unsigned int id)
+  {
+    return dev_mgr::instance().get_device(id);
+  }
+
+  /// Util function to get the context of the default queue of current
+  /// device in device manager.
+  static inline sycl::context get_default_context()
+  {
+    return infra::get_current_device().get_context();
+  }
+
+  /// Util function to get a cpu device.
+  static inline device_ext &cpu_device()
+  {
+    return dev_mgr::instance().cpu_device();
+  }
+
+}
+
+#endif
diff --git a/src/infra/infra.hpp b/src/infra/infra.hpp
new file mode 100644
index 0000000..4985128
--- /dev/null
+++ b/src/infra/infra.hpp
@@ -0,0 +1,35 @@
+// Copyright (C) 2023 Intel Corporation
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom
+// the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
+// OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+// OR OTHER DEALINGS IN THE SOFTWARE.
+
+// SPDX-License-Identifier: MIT
+//===----------------------------------------------------------------------===//
+
+#ifndef __INFRA_HPP__
+#define __INFRA_HPP__
+
+#include <sycl/sycl.hpp>
+#include <iostream>
+#include <limits.h>
+
+#include "atomic.hpp"
+#include "device.hpp"
+#include "memory.hpp"
+
+#endif // __INFRA_HPP__
diff --git a/src/infra/memory.hpp b/src/infra/memory.hpp
new file mode 100644
index 0000000..cde7ef7
--- /dev/null
+++ b/src/infra/memory.hpp
@@ -0,0 +1,1291 @@
+//==---- memory.hpp -------------------------------*- C++ -*----------------==//
+// Copyright (C) 2023 Intel Corporation
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom
+// the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
+// OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+// OR OTHER DEALINGS IN THE SOFTWARE.
+
+// SPDX-License-Identifier: MIT
+//===----------------------------------------------------------------------===//
+
+#ifndef __INFRA_MEMORY_HPP__
+#define __INFRA_MEMORY_HPP__
+
+#include "device.hpp"
+#include <sycl/sycl.hpp>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <mutex>
+#include <unordered_map>
+#include <map>
+#include <utility>
+#include <thread>
+#include <type_traits>
+
+#if defined(__linux__)
+#include <sys/mman.h>
+#elif defined(_WIN64)
+#define NOMINMAX
+#include <windows.h>
+#else
+#error "Only support Windows and Linux."
+#endif
+
+namespace infra
+{
+
+  enum memcpy_direction
+  {
+    host_to_host,
+    host_to_device,
+    device_to_host,
+    device_to_device,
+    automatic
+  };
+  enum memory_region
+  {
+    global = 0, // device global memory
+    constant,   // device constant memory
+    local,      // device local memory
+    shared,     // memory which can be accessed by host and device
+  };
+
+  typedef uint8_t byte_t;
+
+  /// Buffer type to be used in Memory Management runtime.
+  typedef sycl::buffer<byte_t> buffer_t;
+
+  /// Pitched 2D/3D memory data.
+  class pitched_data
+  {
+  public:
+    pitched_data() : pitched_data(nullptr, 0, 0, 0) {}
+    pitched_data(void *data, size_t pitch, size_t x, size_t y)
+        : _data(data), _pitch(pitch), _x(x), _y(y) {}
+    // Getters below are const-qualified so they also work on const objects.
+    void *get_data_ptr() const { return _data; }
+    void set_data_ptr(void *data) { _data = data; }
+
+    size_t get_pitch() const { return _pitch; }
+    void set_pitch(size_t pitch) { _pitch = pitch; }
+
+    size_t get_x() const { return _x; }
+    void set_x(size_t x) { _x = x; }
+
+    size_t get_y() const { return _y; }
+    void set_y(size_t y) { _y = y; }
+
+  private:
+    void *_data; // start of the pitched memory region (not owned by this class)
+    size_t _pitch, _x, _y;
+  };
+
+  namespace detail
+  {
+    class mem_mgr
+    {
+      mem_mgr()
+      {
+        // Reserve address space only (no memory is committed); throw on failure.
+#if defined(__linux__)
+        void *base = mmap(nullptr, mapped_region_size, PROT_NONE,
+                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+        if (base == MAP_FAILED) // mmap reports failure as MAP_FAILED, not NULL
+          base = nullptr;
+#elif defined(_WIN64)
+        void *base = VirtualAlloc(NULL, mapped_region_size, MEM_RESERVE,
+                                  PAGE_NOACCESS); // reserve pages, no access
+#else
+#error "Only support Windows and Linux."
+#endif
+        if (!base)
+          throw std::runtime_error("mem_mgr: failed to reserve virtual address space");
+        next_free = mapped_address_space = static_cast<byte_t *>(base);
+      }
+
+    public:
+      using buffer_id_t = int;
+
+      struct allocation
+      {
+        buffer_t buffer;
+        byte_t *alloc_ptr;
+        size_t size;
+      };
+
+      ~mem_mgr()
+      {
+#if defined(__linux__)
+        munmap(mapped_address_space, mapped_region_size);
+#elif defined(_WIN64)
+        VirtualFree(mapped_address_space, 0, MEM_RELEASE);
+#else
+#error "Only support Windows and Linux."
+#endif
+      };
+
+      mem_mgr(const mem_mgr &) = delete;
+      mem_mgr &operator=(const mem_mgr &) = delete;
+      mem_mgr(mem_mgr &&) = delete;
+      mem_mgr &operator=(mem_mgr &&) = delete;
+
+      /// Allocate \p size bytes of virtual device memory backed by a new buffer.
+      /// Returns nullptr for size 0; throws std::runtime_error when the reserved
+      /// address range cannot hold the request.
+      void *mem_alloc(size_t size)
+      {
+        if (!size)
+          return nullptr;
+        std::lock_guard<std::mutex> lock(m_mutex);
+        // Compare sizes rather than pointers: `next_free + size` can wrap around
+        // for huge requests (e.g. a negative count cast to size_t).
+        const size_t used = static_cast<size_t>(next_free - mapped_address_space);
+        if (used > mapped_region_size || size > mapped_region_size - used)
+          throw std::runtime_error("sift_malloc: out of memory for virtual memory pool");
+        // The map is keyed by the one-past-the-end address of each allocation.
+        void *result = next_free;
+        m_map.emplace(next_free + size,
+                      allocation{buffer_t(sycl::range<1>(size)), next_free, size});
+        // Advance to the next aligned free address (plus optional debug padding).
+        next_free += (size + extra_padding + alignment - 1) & ~(alignment - 1);
+
+        return result;
+      }
+
+      /// Deallocate
+      void mem_free(const void *ptr)
+      {
+        if (!ptr)
+          return;
+        std::lock_guard<std::mutex> lock(m_mutex);
+        auto it = get_map_iterator(ptr);
+        m_map.erase(it);
+      }
+
+      /// map: device pointer -> allocation(buffer, alloc_ptr, size)
+      allocation translate_ptr(const void *ptr)
+      {
+        std::lock_guard<std::mutex> lock(m_mutex);
+        auto it = get_map_iterator(ptr);
+        return it->second;
+      }
+
+      /// Check if the pointer represents device pointer or not.
+      bool is_device_ptr(const void *ptr) const
+      {
+        std::lock_guard<std::mutex> lock(m_mutex);
+        return (mapped_address_space <= ptr) &&
+               (ptr < mapped_address_space + mapped_region_size);
+      }
+
+      /// Returns the instance of memory manager singleton.
+      static mem_mgr &instance()
+      {
+        static mem_mgr m;
+        return m;
+      }
+
+    private:
+      std::map<byte_t *, allocation> m_map;
+      mutable std::mutex m_mutex;
+      byte_t *mapped_address_space;
+      byte_t *next_free;
+      const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024;
+      const size_t alignment = 256;
+      /// This padding may be defined to some positive value to debug
+      /// out of bound accesses.
+      const size_t extra_padding = 0;
+
+      std::map<byte_t *, allocation>::iterator get_map_iterator(const void *ptr)
+      {
+        auto it = m_map.upper_bound((byte_t *)ptr);
+        if (it == m_map.end())
+        {
+          // Not a virtual pointer.
+          throw std::runtime_error("can not get buffer from non-virtual pointer");
+        }
+        const allocation &alloc = it->second;
+        if (ptr < alloc.alloc_ptr)
+        {
+          // Out of bound.
+          // This may happen if there's a gap between allocations due to alignment
+          // or extra padding and pointer points to this gap.
+          throw std::runtime_error("invalid virtual pointer");
+        }
+        return it;
+      }
+    };
+
+    template <class T, memory_region Memory, size_t Dimension>
+    class accessor;
+    template <memory_region Memory, class T = byte_t>
+    class memory_traits
+    {
+    public:
+      static constexpr sycl::access::address_space asp =
+          (Memory == local)
+              ? sycl::access::address_space::local_space
+              : ((Memory == constant)
+                     ? sycl::access::address_space::constant_space
+                     : sycl::access::address_space::global_space);
+      static constexpr sycl::access::target target =
+          (Memory == local)
+              ? sycl::access::target::local
+              : ((Memory == constant) ? sycl::access::target::constant_buffer
+                                      : sycl::access::target::global_buffer);
+      static constexpr sycl::access_mode mode =
+          (Memory == constant) ? sycl::access_mode::read
+                               : sycl::access_mode::read_write;
+      static constexpr size_t type_size = sizeof(T);
+      using element_t =
+          typename std::conditional<Memory == constant, const T, T>::type;
+      using value_t = typename std::remove_cv<T>::type;
+      template <size_t Dimension = 1>
+      using accessor_t = sycl::accessor<T, Dimension, mode, target>;
+      using pointer_t = T *;
+    };
+
+    static inline void *sift_malloc(size_t size, sycl::queue &q)
+    {
+#ifdef INFRA_USM_LEVEL_NONE
+      return mem_mgr::instance().mem_alloc(size * sizeof(byte_t));
+#else
+      return sycl::malloc_device(size, q.get_device(), q.get_context());
+#endif
+    }
+
+#define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
+    static inline void *sift_malloc(size_t &pitch, size_t x, size_t y, size_t z,
+                                    sycl::queue &q)
+    {
+      pitch = PITCH_DEFAULT_ALIGN(x);
+      return sift_malloc(pitch * y * z, q);
+    }
+
+    /// Set \p value to the first \p size bytes starting from \p dev_ptr in \p q.
+    ///
+    /// \param q The queue in which the operation is done.
+    /// \param dev_ptr Pointer to the device memory address.
+    /// \param value Value to be set.
+    /// \param size Number of bytes to be set to the value.
+    /// \returns An event representing the memset operation.
+    static inline sycl::event sift_memset(sycl::queue &q, void *dev_ptr,
+                                          int value, size_t size)
+    {
+#ifdef INFRA_USM_LEVEL_NONE
+      auto &mm = mem_mgr::instance();
+      assert(mm.is_device_ptr(dev_ptr));
+      auto alloc = mm.translate_ptr(dev_ptr);
+      size_t offset = (byte_t *)dev_ptr - alloc.alloc_ptr;
+
+      return q.submit([&](sycl::handler &cgh)
+                      {
+    auto r = sycl::range<1>(size);
+    auto o = sycl::id<1>(offset);
+    sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                       sycl::access::target::global_buffer>
+        acc(alloc.buffer, cgh, r, o);
+    cgh.fill(acc, (byte_t)value); });
+#else
+      return q.memset(dev_ptr, value, size);
+#endif
+    }
+
+    /// Set \p value to the 3D memory region pointed by \p data in \p q. \p size
+    /// specifies the 3D memory size to set.
+    ///
+    /// \param q The queue in which the operation is done.
+    /// \param data Pointer to the device memory region.
+    /// \param value Value to be set.
+    /// \param size Memory region size.
+    /// \returns An event list representing the memset operations.
+    static inline std::vector<sycl::event>
+    sift_memset(sycl::queue &q, pitched_data data, int value,
+                sycl::range<3> size)
+    {
+      std::vector<sycl::event> event_list;
+      size_t slice = data.get_pitch() * data.get_y();
+      unsigned char *data_surface = (unsigned char *)data.get_data_ptr();
+      for (size_t z = 0; z < size.get(2); ++z)
+      {
+        unsigned char *data_ptr = data_surface;
+        for (size_t y = 0; y < size.get(1); ++y)
+        {
+          event_list.push_back(sift_memset(q, data_ptr, value, size.get(0)));
+          data_ptr += data.get_pitch();
+        }
+        data_surface += slice;
+      }
+      return event_list;
+    }
+
+    /// memset 2D matrix with pitch.
+    static inline std::vector<sycl::event>
+    sift_memset(sycl::queue &q, void *ptr, size_t pitch, int val, size_t x,
+                size_t y)
+    {
+      return sift_memset(q, pitched_data(ptr, pitch, x, 1), val,
+                         sycl::range<3>(x, y, 1));
+    }
+
+    static sycl::event sift_memcpy(sycl::queue &q, void *to_ptr,
+                                   const void *from_ptr, size_t size,
+                                   memcpy_direction direction)
+    {
+      if (!size)
+        return sycl::event{};
+#ifdef INFRA_USM_LEVEL_NONE
+      auto &mm = mem_mgr::instance();
+      memcpy_direction real_direction = direction;
+      switch (direction)
+      {
+      case host_to_host:
+        assert(!mm.is_device_ptr(from_ptr) && !mm.is_device_ptr(to_ptr));
+        break;
+      case host_to_device:
+        assert(!mm.is_device_ptr(from_ptr) && mm.is_device_ptr(to_ptr));
+        break;
+      case device_to_host:
+        assert(mm.is_device_ptr(from_ptr) && !mm.is_device_ptr(to_ptr));
+        break;
+      case device_to_device:
+        assert(mm.is_device_ptr(from_ptr) && mm.is_device_ptr(to_ptr));
+        break;
+      case automatic:
+        bool from_device = mm.is_device_ptr(from_ptr);
+        bool to_device = mm.is_device_ptr(to_ptr);
+        if (from_device)
+        {
+          if (to_device)
+          {
+            real_direction = device_to_device;
+          }
+          else
+          {
+            real_direction = device_to_host;
+          }
+        }
+        else
+        {
+          if (to_device)
+          {
+            real_direction = host_to_device;
+          }
+          else
+          {
+            real_direction = host_to_host;
+          }
+        }
+        break;
+      }
+      bool is_cpu = q.get_device().is_cpu();
+
+      switch (real_direction)
+      {
+      case host_to_host:
+        std::memcpy(to_ptr, from_ptr, size);
+        return sycl::event();
+      case host_to_device:
+      {
+        auto alloc = mm.translate_ptr(to_ptr);
+        size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
+        if (is_cpu)
+        {
+          buffer_t from_buffer((byte_t *)from_ptr, sycl::range<1>(size), {sycl::property::buffer::use_host_ptr()});
+          return q.submit([&](sycl::handler &cgh)
+                          {
+        auto r = sycl::range<1>(size);
+        auto o = sycl::id<1>(offset);
+        auto from_acc = from_buffer.get_access<sycl::access_mode::read>(cgh);
+        sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                           sycl::access::target::global_buffer>
+            acc(alloc.buffer, cgh, r, o);
+        cgh.parallel_for<class memcopyh2d>(r, [=](sycl::id<1> idx) {
+          acc[idx] = from_acc[idx];
+          }); });
+        }
+        else
+        {
+          return q.submit([&](sycl::handler &cgh)
+                          {
+        auto r = sycl::range<1>(size);
+        auto o = sycl::id<1>(offset);
+         sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                           sycl::access::target::global_buffer>
+            acc(alloc.buffer, cgh, r, o);
+        cgh.copy(from_ptr, acc); });
+        }
+      }
+      case device_to_host:
+      {
+        auto alloc = mm.translate_ptr(from_ptr);
+        size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
+        if (is_cpu)
+        {
+          buffer_t to_buffer((byte_t *)to_ptr, sycl::range<1>(size), {sycl::property::buffer::use_host_ptr()});
+          return q.submit([&](sycl::handler &cgh)
+                          {
+        auto r = sycl::range<1>(size);
+        auto o = sycl::id<1>(offset);
+        auto to_acc = to_buffer.get_access<sycl::access_mode::write>(cgh);
+        sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                           sycl::access::target::global_buffer>
+            acc(alloc.buffer, cgh, r, o);
+        cgh.parallel_for<class memcopyd2h>(r, [=](sycl::id<1> idx) {
+          to_acc[idx] = acc[idx];
+          }); });
+        }
+        else
+        {
+          return q.submit([&](sycl::handler &cgh)
+                          {
+        auto r = sycl::range<1>(size);
+        auto o = sycl::id<1>(offset);
+        sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                           sycl::access::target::global_buffer>
+            acc(alloc.buffer, cgh, r, o);
+        cgh.copy(acc, to_ptr); });
+        }
+      }
+      case device_to_device:
+      {
+        auto to_alloc = mm.translate_ptr(to_ptr);
+        auto from_alloc = mm.translate_ptr(from_ptr);
+        size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
+        size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
+        if (is_cpu)
+        {
+          return q.submit([&](sycl::handler &cgh)
+                          {
+        auto r = sycl::range<1>(size);
+        auto to_o = sycl::id<1>(to_offset);
+        auto from_o = sycl::id<1>(from_offset);
+        sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                           sycl::access::target::global_buffer>
+            to_acc(to_alloc.buffer, cgh, r, to_o);
+        sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                           sycl::access::target::global_buffer>
+            from_acc(from_alloc.buffer, cgh, r, from_o);
+        cgh.parallel_for<class memcopyd2d>(r, [=](sycl::id<1> idx) {
+          to_acc[idx] = from_acc[idx];
+          }); });
+        }
+        else
+        {
+          return q.submit([&](sycl::handler &cgh)
+                          {
+        auto r = sycl::range<1>(size);
+        auto to_o = sycl::id<1>(to_offset);
+        auto from_o = sycl::id<1>(from_offset);
+        sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                           sycl::access::target::global_buffer>
+            to_acc(to_alloc.buffer, cgh, r, to_o);
+        sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                           sycl::access::target::global_buffer>
+            from_acc(from_alloc.buffer, cgh, r, from_o);
+        cgh.copy(from_acc, to_acc); });
+        }
+      }
+      default:
+        throw std::runtime_error("sift_memcpy: invalid direction value");
+      }
+#else
+      return q.memcpy(to_ptr, from_ptr, size);
+#endif
+    }
+
+    /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
+    /// and \p from_range to another specified by \p to_ptr and \p to_range.
+    static inline std::vector<sycl::event>
+    sift_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+                sycl::range<3> to_range, sycl::range<3> from_range,
+                sycl::id<3> to_id, sycl::id<3> from_id,
+                sycl::range<3> size, memcpy_direction direction)
+    {
+      std::vector<sycl::event> event_list;
+
+      size_t to_slice = to_range.get(1) * to_range.get(0),
+             from_slice = from_range.get(1) * from_range.get(0);
+      unsigned char *to_surface = (unsigned char *)to_ptr +
+                                  to_id.get(2) * to_slice +
+                                  to_id.get(1) * to_range.get(0) + to_id.get(0);
+      const unsigned char *from_surface =
+          (const unsigned char *)from_ptr + from_id.get(2) * from_slice +
+          from_id.get(1) * from_range.get(0) + from_id.get(0);
+
+      if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
+      {
+        return {sift_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
+                            direction)};
+      }
+      for (size_t z = 0; z < size.get(2); ++z)
+      {
+        unsigned char *to_ptr = to_surface;
+        const unsigned char *from_ptr = from_surface;
+        if (to_range.get(0) == from_range.get(0) &&
+            to_range.get(0) == size.get(0))
+        {
+          event_list.push_back(sift_memcpy(q, to_ptr, from_ptr,
+                                           size.get(0) * size.get(1), direction));
+        }
+        else
+        {
+          for (size_t y = 0; y < size.get(1); ++y)
+          {
+            event_list.push_back(
+                sift_memcpy(q, to_ptr, from_ptr, size.get(0), direction));
+            to_ptr += to_range.get(0);
+            from_ptr += from_range.get(0);
+          }
+        }
+        to_surface += to_slice;
+        from_surface += from_slice;
+      }
+      return event_list;
+    }
+
+    /// memcpy 2D/3D matrix specified by pitched_data.
+    static inline std::vector<sycl::event>
+    sift_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
+                pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
+                memcpy_direction direction = automatic)
+    {
+      return sift_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
+                         sycl::range<3>(to.get_pitch(), to.get_y(), 1),
+                         sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
+                         size, direction);
+    }
+
+    /// memcpy 2D matrix with pitch.
+    static inline std::vector<sycl::event>
+    sift_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+                size_t to_pitch, size_t from_pitch, size_t x, size_t y,
+                memcpy_direction direction = automatic)
+    {
+      return sift_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
+                         sycl::range<3>(from_pitch, y, 1),
+                         sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
+                         sycl::range<3>(x, y, 1), direction);
+    }
+  } // namespace detail
+
+#ifdef INFRA_USM_LEVEL_NONE
+  /// Check if the pointer \p ptr represents device pointer or not.
+  ///
+  /// \param ptr The pointer to be checked.
+  /// \returns true if \p ptr is a device pointer.
+  template <class T>
+  static inline bool is_device_ptr(T ptr)
+  {
+    if constexpr (std::is_pointer<T>::value)
+    {
+      return detail::mem_mgr::instance().is_device_ptr(ptr);
+    }
+    return false;
+  }
+#endif
+
+  /// Get the buffer and the offset of a piece of memory pointed to by \p ptr.
+  ///
+  /// \param ptr Pointer to a piece of memory.
+  /// If NULL is passed as an argument, an exception will be thrown.
+  /// \returns a pair containing both the buffer and the offset.
+  static std::pair<buffer_t, size_t> get_buffer_and_offset(const void *ptr)
+  {
+    if (ptr)
+    {
+      auto alloc = detail::mem_mgr::instance().translate_ptr(ptr);
+      size_t offset = (byte_t *)ptr - alloc.alloc_ptr;
+      return std::make_pair(alloc.buffer, offset);
+    }
+    else
+    {
+      throw std::runtime_error(
+          "NULL pointer argument in get_buffer_and_offset function is invalid");
+    }
+  }
+
+  /// Get the data pointed from \p ptr as a 1D buffer reinterpreted as type T.
+  template <typename T>
+  static sycl::buffer<T> get_buffer(const void *ptr)
+  {
+    auto alloc = detail::mem_mgr::instance().translate_ptr(ptr);
+    return alloc.buffer.reinterpret<T>(
+        sycl::range<1>(alloc.size / sizeof(T)));
+  }
+
+  /// Get the buffer of a piece of memory pointed to by \p ptr.
+  ///
+  /// \param ptr Pointer to a piece of memory.
+  /// \returns the buffer.
+  static buffer_t get_buffer(const void *ptr)
+  {
+    return detail::mem_mgr::instance().translate_ptr(ptr).buffer;
+  }
+
+  /// A wrapper class contains an accessor and an offset.
+  template <typename dataT,
+            sycl::access_mode accessMode = sycl::access_mode::read_write>
+  class access_wrapper
+  {
+    sycl::accessor<byte_t, 1, accessMode> accessor;
+    size_t offset;
+
+  public:
+    /// Construct the accessor wrapper for memory pointed by \p ptr.
+    ///
+    /// \param ptr Pointer to memory.
+    /// \param cgh The command group handler.
+    access_wrapper(const void *ptr, sycl::handler &cgh)
+        : accessor(get_buffer(ptr).get_access<accessMode>(cgh)), offset(0)
+    {
+      auto alloc = detail::mem_mgr::instance().translate_ptr(ptr);
+      offset = (byte_t *)ptr - alloc.alloc_ptr;
+    }
+
+    /// Get the device pointer.
+    ///
+    /// \returns a device pointer with offset.
+    dataT get_raw_pointer() const { return (dataT)(&accessor[0] + offset); }
+  };
+
+  /// Get the accessor for memory pointed by \p ptr.
+  ///
+  /// \param ptr Pointer to memory.
+  /// If NULL is passed as an argument, an exception will be thrown.
+  /// \param cgh The command group handler.
+  /// \returns an accessor.
+  template <sycl::access_mode accessMode = sycl::access_mode::read_write>
+  static sycl::accessor<byte_t, 1, accessMode>
+  get_access(const void *ptr, sycl::handler &cgh)
+  {
+    if (ptr)
+    {
+      auto alloc = detail::mem_mgr::instance().translate_ptr(ptr);
+      return alloc.buffer.get_access<accessMode>(cgh);
+    }
+    else
+    {
+      throw std::runtime_error(
+          "NULL pointer argument in get_access function is invalid");
+    }
+  }
+
+  /// Allocate memory block on the device.
+  /// \param num_bytes Number of bytes to allocate.
+  /// \param q Queue to execute the allocate task.
+  /// \returns A pointer to the newly allocated memory.
+  template <typename T> // T: any type that static_cast converts to size_t
+  static inline void *sift_malloc(T num_bytes,
+                                  sycl::queue &q = get_default_queue())
+  {
+    return detail::sift_malloc(static_cast<size_t>(num_bytes), q); // forwards the byte count as size_t
+  }
+
+  /// Get the host pointer from a buffer that is mapped to virtual pointer ptr.
+  /// \param ptr Virtual Pointer mapped to device buffer
+  /// \returns A host pointer
+  template <typename T>
+  static inline T *get_host_ptr(const void *ptr)
+  {
+    // .first is used as the backing buffer, .second as ptr's offset into it.
+    auto located = get_buffer_and_offset(ptr);
+    auto base = located.first.get_access<sycl::access_mode::read_write>()
+                    .get_pointer();
+    return (T *)(base + located.second);
+  }
+
+  /// Allocate memory block for 3D array on the device.
+  /// \param size Size of the memory block, in bytes.
+  /// \param q Queue to execute the allocate task.
+  /// \returns A pitched_data object which stores the memory info.
+  static inline pitched_data
+  sift_malloc(sycl::range<3> size, sycl::queue &q = get_default_queue())
+  {
+    const size_t nx = size.get(0), ny = size.get(1), nz = size.get(2);
+    size_t row_pitch; // written by detail::sift_malloc through its first argument
+    pitched_data result(nullptr, 0, nx, ny);
+    result.set_data_ptr(detail::sift_malloc(row_pitch, nx, ny, nz, q));
+    result.set_pitch(row_pitch);
+    return result;
+  }
+
+  /// Allocate memory block for 2D array on the device.
+  /// \param [out] pitch Aligned size of x in bytes.
+  /// \param x Range in dim x.
+  /// \param y Range in dim y.
+  /// \param q Queue to execute the allocate task.
+  /// \returns A pointer to the newly allocated memory.
+  static inline void *sift_malloc(size_t &pitch, size_t x, size_t y,
+                                  sycl::queue &q = get_default_queue())
+  { // Same helper the 3D overload calls, with the third extent fixed to 1.
+    return detail::sift_malloc(pitch, x, y, 1, q);
+  }
+
+  /// Free the memory pointed to by \p ptr. A null \p ptr is ignored.
+  /// \param ptr Pointer to the memory to free.
+  /// \param q Queue to execute the free task.
+  /// \returns no return value.
+  static inline void infra_free(void *ptr,
+                                sycl::queue &q = get_default_queue())
+  {
+    if (!ptr)
+      return; // nothing to release for a null pointer
+    // Release through whichever allocator this build was configured with.
+#ifdef INFRA_USM_LEVEL_NONE
+    detail::mem_mgr::instance().mem_free(ptr);
+#else
+    sycl::free(ptr, q.get_context());
+#endif
+  }
+
+#ifndef INFRA_USM_LEVEL_NONE
+  /// Free the device memory pointed by a batch of pointers in \p pointers which
+  /// are related to \p q after \p events completed.
+  ///
+  /// \param pointers The pointers point to the device memory requested to be freed.
+  /// \param events The events to be waited.
+  /// \param q The sycl::queue the memory relates to.
+  inline void async_infra_free(std::vector<void *> pointers,
+                               std::vector<sycl::event> events,
+                               sycl::queue &q = get_default_queue())
+  {
+    // The closure owns the pointer list, the event list and the context.
+    auto free_when_done = [doomed = std::move(pointers),
+                           pending = std::move(events),
+                           ctxt = q.get_context()]()
+    {
+      sycl::event::wait(pending);
+      for (void *p : doomed)
+        sycl::free(p, ctxt);
+    };
+    get_current_device().add_task(std::thread(std::move(free_when_done)));
+  }
+#endif
+
+  /// Synchronously copies \p size bytes from the address specified by \p from_ptr
+  /// to the address specified by \p to_ptr. The value of \p direction is used to
+  /// set the copy direction, it can be \a host_to_host, \a host_to_device,
+  /// \a device_to_host, \a device_to_device or \a automatic. The function will
+  /// return after the copy is completed.
+  ///
+  /// \param to_ptr Pointer to destination memory address.
+  /// \param from_ptr Pointer to source memory address.
+  /// \param size Number of bytes to be copied.
+  /// \param direction Direction of the copy.
+  /// \param q Queue to execute the copy task.
+  /// \returns no return value.
+  static void sift_memcpy(void *to_ptr, const void *from_ptr, size_t size,
+                          memcpy_direction direction = automatic,
+                          sycl::queue &q = get_default_queue())
+  { // Blocking variant: calls .wait() on whatever detail::sift_memcpy returns.
+    detail::sift_memcpy(q, to_ptr, from_ptr, size, direction).wait();
+  }
+
+  /// Asynchronously copies \p size bytes from the address specified by \p
+  /// from_ptr to the address specified by \p to_ptr. The value of \p direction is
+  /// used to set the copy direction, it can be \a host_to_host, \a
+  /// host_to_device, \a device_to_host, \a device_to_device or \a automatic. The
+  /// return of the function does NOT guarantee the copy is completed.
+  ///
+  /// \param to_ptr Pointer to destination memory address.
+  /// \param from_ptr Pointer to source memory address.
+  /// \param size Number of bytes to be copied.
+  /// \param direction Direction of the copy.
+  /// \param q Queue to execute the copy task.
+  /// \returns no return value.
+  static void async_sift_memcpy(void *to_ptr, const void *from_ptr, size_t size,
+                                memcpy_direction direction = automatic,
+                                sycl::queue &q = infra::get_default_queue())
+  { // Non-blocking variant: the helper's return value is dropped without a wait.
+    detail::sift_memcpy(q, to_ptr, from_ptr, size, direction);
+  }
+
+  /// Synchronously copies 2D matrix specified by \p x and \p y from the address
+  /// specified by \p from_ptr to the address specified by \p to_ptr, while \p
+  /// from_pitch and \p to_pitch are the range of dim x in bytes of the matrix
+  /// specified by \p from_ptr and \p to_ptr. The value of \p direction is used to
+  /// set the copy direction, it can be \a host_to_host, \a host_to_device, \a
+  /// device_to_host, \a device_to_device or \a automatic. The function will
+  /// return after the copy is completed.
+  ///
+  /// \param to_ptr Pointer to destination memory address.
+  /// \param to_pitch Range of dim x in bytes of destination matrix.
+  /// \param from_ptr Pointer to source memory address.
+  /// \param from_pitch Range of dim x in bytes of source matrix.
+  /// \param x Range of dim x of matrix to be copied.
+  /// \param y Range of dim y of matrix to be copied.
+  /// \param direction Direction of the copy.
+  /// \param q Queue to execute the copy task.
+  /// \returns no return value.
+  static inline void sift_memcpy(void *to_ptr, size_t to_pitch,
+                                 const void *from_ptr, size_t from_pitch,
+                                 size_t x, size_t y,
+                                 memcpy_direction direction = automatic,
+                                 sycl::queue &q = infra::get_default_queue())
+  { // Blocking. Note the helper's argument order: both pointers, then both pitches.
+    sycl::event::wait(detail::sift_memcpy(q, to_ptr, from_ptr, to_pitch,
+                                          from_pitch, x, y, direction));
+  }
+
+  /// Asynchronously copies 2D matrix specified by \p x and \p y from the address
+  /// specified by \p from_ptr to the address specified by \p to_ptr, while
+  /// \p from_pitch and \p to_pitch are the range of dim x in bytes of the matrix
+  /// specified by \p from_ptr and \p to_ptr. The value of \p direction is used to
+  /// set the copy direction, it can be \a host_to_host, \a host_to_device, \a
+  /// device_to_host, \a device_to_device or \a automatic. The return of the
+  /// function does NOT guarantee the copy is completed.
+  ///
+  /// \param to_ptr Pointer to destination memory address.
+  /// \param to_pitch Range of dim x in bytes of destination matrix.
+  /// \param from_ptr Pointer to source memory address.
+  /// \param from_pitch Range of dim x in bytes of source matrix.
+  /// \param x Range of dim x of matrix to be copied.
+  /// \param y Range of dim y of matrix to be copied.
+  /// \param direction Direction of the copy.
+  /// \param q Queue to execute the copy task.
+  /// \returns no return value.
+  static inline void
+  async_sift_memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr,
+                    size_t from_pitch, size_t x, size_t y,
+                    memcpy_direction direction = automatic,
+                    sycl::queue &q = get_default_queue())
+  { // Non-blocking. The helper takes both pointers first, then both pitches.
+    detail::sift_memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y,
+                        direction);
+  }
+
+  /// Synchronously copies a subset of a 3D matrix specified by \p from to another
+  /// 3D matrix specified by \p to. The from and to position info are specified
+  /// by \p from_pos and \p to_pos. The copied matrix size is specified by \p size.
+  /// The value of \p direction is used to set the copy direction, it can be \a
+  /// host_to_host, \a host_to_device, \a device_to_host, \a device_to_device or
+  /// \a automatic. The function will return after the copy is completed.
+  ///
+  /// \param to Destination matrix info.
+  /// \param to_pos Position of destination.
+  /// \param from Source matrix info.
+  /// \param from_pos Position of source.
+  /// \param size Range of the submatrix to be copied.
+  /// \param direction Direction of the copy.
+  /// \param q Queue to execute the copy task.
+  /// \returns no return value.
+  static inline void sift_memcpy(pitched_data to, sycl::id<3> to_pos,
+                                 pitched_data from, sycl::id<3> from_pos,
+                                 sycl::range<3> size,
+                                 memcpy_direction direction = automatic,
+                                 sycl::queue &q = infra::get_default_queue())
+  { // Blocking variant: hands the helper's result to sycl::event::wait.
+    sycl::event::wait(
+        detail::sift_memcpy(q, to, to_pos, from, from_pos, size, direction));
+  }
+
+  /// Asynchronously copies a subset of a 3D matrix specified by \p from to another
+  /// 3D matrix specified by \p to. The from and to position info are specified
+  /// by \p from_pos and \p to_pos. The copied matrix size is specified by \p size.
+  /// The value of \p direction is used to set the copy direction, it can be \a
+  /// host_to_host, \a host_to_device, \a device_to_host, \a device_to_device or
+  /// \a automatic. The return of the function does NOT guarantee the copy is
+  /// completed.
+  ///
+  /// \param to Destination matrix info.
+  /// \param to_pos Position of destination.
+  /// \param from Source matrix info.
+  /// \param from_pos Position of source.
+  /// \param size Range of the submatrix to be copied.
+  /// \param direction Direction of the copy.
+  /// \param q Queue to execute the copy task.
+  /// \returns no return value.
+  static inline void
+  async_sift_memcpy(pitched_data to, sycl::id<3> to_pos, pitched_data from,
+                    sycl::id<3> from_pos, sycl::range<3> size,
+                    memcpy_direction direction = automatic,
+                    sycl::queue &q = get_default_queue())
+  { // Non-blocking variant: the helper's return value is dropped without a wait.
+    detail::sift_memcpy(q, to, to_pos, from, from_pos, size, direction);
+  }
+
+  /// Synchronously sets \p value to the first \p size bytes starting from \p
+  /// dev_ptr. The function will return after the memset operation is completed.
+  ///
+  /// \param dev_ptr Pointer to the device memory address.
+  /// \param value Value to be set.
+  /// \param size Number of bytes to be set to the value.
+  /// \param q The queue in which the operation is done.
+  /// \returns no return value.
+  static void sift_memset(void *dev_ptr, int value, size_t size,
+                          sycl::queue &q = get_default_queue())
+  { // Blocking variant: calls .wait() on whatever detail::sift_memset returns.
+    detail::sift_memset(q, dev_ptr, value, size).wait();
+  }
+
+  /// Asynchronously sets \p value to the first \p size bytes starting from \p
+  /// dev_ptr. The return of the function does NOT guarantee the memset operation
+  /// is completed.
+  ///
+  /// \param dev_ptr Pointer to the device memory address.
+  /// \param value Value to be set.
+  /// \param size Number of bytes to be set to the value.
+  /// \returns no return value.
+  static void async_sift_memset(void *dev_ptr, int value, size_t size,
+                                sycl::queue &q = infra::get_default_queue()) // q: the queue in which the operation is done
+  { // Non-blocking variant: the helper's return value is dropped without a wait.
+    detail::sift_memset(q, dev_ptr, value, size);
+  }
+
+  /// Sets \p val to the 2D memory region pointed by \p ptr in \p q. \p x and
+  /// \p y specify the size of the 2D region to set. \p pitch is the bytes in
+  /// linear dimension, including padding bytes. The function will return after
+  /// the memset operation is completed.
+  ///
+  /// \param ptr Pointer to the device memory region.
+  /// \param pitch Bytes in linear dimension, including padding bytes.
+  /// \param val Value to be set.
+  /// \param x The size of the region to set in linear dimension.
+  /// \param y The size of the region to set in second dimension.
+  /// \param q The queue in which the operation is done.
+  /// \returns no return value.
+  static inline void sift_memset(void *ptr, size_t pitch, int val, size_t x,
+                                 size_t y,
+                                 sycl::queue &q = get_default_queue())
+  { // Blocking variant: hands the helper's result to sycl::event::wait.
+    sycl::event::wait(detail::sift_memset(q, ptr, pitch, val, x, y));
+  }
+
+  /// Sets \p val to the 2D memory region pointed by \p ptr in \p q. \p x and
+  /// \p y specify the size of the 2D region to set. \p pitch is the bytes in
+  /// linear dimension, including padding bytes. The return of the function does
+  /// NOT guarantee the memset operation is completed.
+  ///
+  /// \param ptr Pointer to the device memory region.
+  /// \param pitch Bytes in linear dimension, including padding bytes.
+  /// \param val Value to be set.
+  /// \param x The size of the region to set in linear dimension.
+  /// \param y The size of the region to set in second dimension.
+  /// \param q The queue in which the operation is done.
+  /// \returns no return value.
+  static inline void async_sift_memset(void *ptr, size_t pitch, int val, size_t x,
+                                       size_t y,
+                                       sycl::queue &q = get_default_queue())
+  { // Non-blocking variant: the helper's return value is dropped without a wait.
+    detail::sift_memset(q, ptr, pitch, val, x, y);
+  }
+
+  /// Sets \p val to the 3D memory region specified by \p pitch in \p q. \p size
+  /// specifies the size of the 3D region to set. The function will return after
+  /// the memset operation is completed.
+  ///
+  /// \param pitch Specify the 3D memory region.
+  /// \param val Value to be set.
+  /// \param size The size of the 3D region to set.
+  /// \param q The queue in which the operation is done.
+  /// \returns no return value.
+  static inline void sift_memset(pitched_data pitch, int val,
+                                 sycl::range<3> size,
+                                 sycl::queue &q = get_default_queue())
+  { // Blocking variant: hands the helper's result to sycl::event::wait.
+    sycl::event::wait(detail::sift_memset(q, pitch, val, size));
+  }
+
+  /// Sets \p val to the 3D memory region specified by \p pitch in \p q. \p size
+  /// specifies the size of the 3D region to set. The return of the function does
+  /// NOT guarantee the memset operation is completed.
+  ///
+  /// \param pitch Specify the 3D memory region.
+  /// \param val Value to be set.
+  /// \param size The size of the 3D region to set.
+  /// \param q The queue in which the operation is done.
+  /// \returns no return value.
+  static inline void async_sift_memset(pitched_data pitch, int val,
+                                       sycl::range<3> size,
+                                       sycl::queue &q = get_default_queue())
+  { // Non-blocking variant: the helper's return value is dropped without a wait.
+    detail::sift_memset(q, pitch, val, size);
+  }
+
+  template <class T, memory_region Memory, size_t Dimension>
+  class accessor;
+  template <class T, memory_region Memory>
+  class accessor<T, Memory, 3>
+  {
+  public:
+    using memory_t = detail::memory_traits<Memory, T>;
+    using element_t = typename memory_t::element_t;
+    using pointer_t = typename memory_t::pointer_t;
+    using accessor_t = typename memory_t::template accessor_t<3>;
+    accessor(pointer_t data, const sycl::range<3> &in_range)
+        : _data(data), _range(in_range) {} // store the data pointer and extents
+    template <memory_region M = Memory> // overload drops out when M == local
+    accessor(std::enable_if_t<M != local, const accessor_t> &acc)
+        : accessor(acc, acc.get_range()) {}
+    accessor(const accessor_t &acc, const sycl::range<3> &in_range)
+        : accessor(acc.get_pointer(), in_range) {}
+    accessor<T, Memory, 2> operator[](size_t index) const
+    { // 2-D view of slice `index`: skip index * (extent 1 * extent 2) elements
+      const sycl::range<2> slice(_range.get(1), _range.get(2));
+      return accessor<T, Memory, 2>(_data + slice.size() * index, slice);
+    }
+
+  private:
+    pointer_t _data;
+    sycl::range<3> _range;
+  };
+  template <class T, memory_region Memory>
+  class accessor<T, Memory, 2>
+  {
+  public:
+    using memory_t = detail::memory_traits<Memory, T>;
+    using element_t = typename memory_t::element_t;
+    using pointer_t = typename memory_t::pointer_t;
+    using accessor_t = typename memory_t::template accessor_t<2>;
+    accessor(pointer_t data, const sycl::range<2> &in_range)
+        : _data(data), _range(in_range) {} // store the data pointer and extents
+    template <memory_region M = Memory> // overload drops out when M == local
+    accessor(std::enable_if_t<M != local, const accessor_t> &acc)
+        : accessor(acc, acc.get_range()) {}
+    accessor(const accessor_t &acc, const sycl::range<2> &in_range)
+        : accessor(acc.get_pointer(), in_range) {}
+    /// \returns the data pointer advanced by \p index times the second extent.
+    pointer_t operator[](size_t index) const
+    {
+      return _data + index * _range.get(1);
+    }
+
+  private:
+    pointer_t _data;
+    sycl::range<2> _range;
+  };
+
+  namespace detail
+  {
+    /// Device variable with address space of shared, global or constant.
+    template <class T, memory_region Memory, size_t Dimension>
+    class device_memory
+    {
+    public:
+      using accessor_t =
+          typename detail::memory_traits<Memory, T>::template accessor_t<Dimension>;
+      using value_t = typename detail::memory_traits<Memory, T>::value_t;
+      using infra_accessor_t = infra::accessor<T, Memory, Dimension>;
+
+      device_memory() : device_memory(sycl::range<Dimension>(1)) {} // delegates with a range extent of 1
+
+      /// Constructor of 1-D array with initializer list; unlisted elements are zero.
+      template <size_t D = Dimension>
+      device_memory(
+          const typename std::enable_if<D == 1, sycl::range<1>>::type &in_range,
+          std::initializer_list<value_t> &&init_list)
+          : device_memory(in_range)
+      {
+        assert(init_list.size() <= in_range.size());
+        _host_ptr = (value_t *)std::malloc(_size); // host staging copy; init() copies it to the device
+        std::memset(_host_ptr, 0, _size);
+        std::memcpy(_host_ptr, init_list.begin(), init_list.size() * sizeof(T));
+      }
+
+      /// Constructor of 2-D array with initializer list; unlisted elements are zero.
+      template <size_t D = Dimension>
+      device_memory(
+          const typename std::enable_if<D == 2, sycl::range<2>>::type &in_range,
+          std::initializer_list<std::initializer_list<value_t>> &&init_list)
+          : device_memory(in_range)
+      {
+        assert(init_list.size() <= in_range[0]);
+        _host_ptr = (value_t *)std::malloc(_size); // host staging copy; init() copies it to the device
+        std::memset(_host_ptr, 0, _size);
+        auto tmp_data = _host_ptr;
+        for (const auto &sub_list : init_list)
+        {
+          assert(sub_list.size() <= in_range[1]);
+          std::memcpy(tmp_data, sub_list.begin(), sub_list.size() * sizeof(T));
+          tmp_data += in_range[1]; // advance by one full row of in_range[1] elements
+        }
+      }
+
+      /// Constructor with range; only records the extents, allocation happens in init().
+      device_memory(const sycl::range<Dimension> &range_in)
+          : _size(range_in.size() * sizeof(T)), _range(range_in), _reference(false),
+            _host_ptr(nullptr), _device_ptr(nullptr)
+      {
+        static_assert(
+            (Memory == global) || (Memory == constant) || (Memory == shared),
+            "device memory region should be global, constant or shared");
+        // Make sure that singleton class mem_mgr and dev_mgr will destruct later
+        // than this.
+        detail::mem_mgr::instance();
+        dev_mgr::instance();
+      }
+
+      /// Constructor with the range extents passed as separate arguments
+      template <class... Args>
+      device_memory(Args... Arguments)
+          : device_memory(sycl::range<Dimension>(Arguments...)) {}
+
+      device_memory(const device_memory &) = delete;
+      device_memory &operator=(const device_memory &) = delete;
+      ~device_memory()
+      {
+        if (_device_ptr && !_reference) // memory bound through assign() is not freed here
+        {
+          try
+          {
+            infra_free(_device_ptr);
+          }
+          catch (std::exception const &e)
+          {
+            std::cerr << e.what() << '\n';
+          }
+        }
+        if (_host_ptr)
+          std::free(_host_ptr);
+      }
+
+      /// Allocate memory with default queue, and init memory if has initial value.
+      void init()
+      {
+        init(infra::get_default_queue());
+      }
+      /// Allocate memory with specified queue, and init memory if has initial value.
+      void init(sycl::queue &q)
+      {
+        if (_device_ptr)
+          return;
+        if (!_size)
+          return;
+        allocate_device(q);
+        if (_host_ptr) // NOTE(review): the copy below is not waited on here; confirm callers synchronize
+          detail::sift_memcpy(q, _device_ptr, _host_ptr, _size, host_to_device);
+      }
+
+      /// Rebind this object to device memory \p src of \p size bytes that it does not own.
+      void assign(value_t *src, size_t size)
+      {
+        this->~device_memory();              // release whatever this object currently holds
+        new (this) device_memory(src, size); // re-construct in place as a reference
+      }
+
+      /// Get memory pointer of the memory object, which is virtual pointer when
+      /// usm is not used, and device pointer when usm is used .
+      value_t *get_ptr()
+      {
+        return get_ptr(get_default_queue());
+      }
+      /// Get memory pointer of the memory object, which is virtual pointer when
+      /// usm is not used, and device pointer when usm is used .
+      value_t *get_ptr(sycl::queue &q)
+      {
+        init(q);
+        return _device_ptr;
+      }
+
+      /// Get the device memory object size in bytes.
+      size_t get_size() { return _size; }
+
+      template <size_t D = Dimension>
+      typename std::enable_if<D == 1, T>::type &operator[](size_t index)
+      {
+        init(); // allocates with the default queue on first use
+#ifdef INFRA_USM_LEVEL_NONE
+        return infra::get_buffer<typename std::enable_if<D == 1, T>::type>(
+                   _device_ptr)
+            .template get_access<sycl::access_mode::read_write>()[index];
+#else
+        return _device_ptr[index];
+#endif
+      }
+
+#ifdef INFRA_USM_LEVEL_NONE
+      /// Get sycl::accessor for the device memory object when usm is not used.
+      accessor_t get_access(sycl::handler &cgh)
+      {
+        return get_buffer(_device_ptr)
+            .template reinterpret<T, Dimension>(_range)
+            .template get_access<detail::memory_traits<Memory, T>::mode,
+                                 detail::memory_traits<Memory, T>::target>(cgh);
+      }
+#else
+      /// Get infra::accessor with dimension info for the device memory object
+      /// when usm is used and dimension is greater than 1.
+      template <size_t D = Dimension>
+      typename std::enable_if<D != 1, infra_accessor_t>::type
+      get_access(sycl::handler &cgh)
+      {
+        return infra_accessor_t((T *)_device_ptr, _range);
+      }
+#endif
+
+    private:
+      device_memory(value_t *memory_ptr, size_t size) // non-owning view, used by assign()
+          : _size(size), _range(size / sizeof(T)), _reference(true),
+            _host_ptr(nullptr), _device_ptr(memory_ptr) {} // null _host_ptr: the destructor frees any non-null value
+
+      void allocate_device(sycl::queue &q)
+      {
+#ifndef INFRA_USM_LEVEL_NONE
+        if (Memory == shared)
+        {
+          _device_ptr = (value_t *)sycl::malloc_shared(
+              _size, q.get_device(), q.get_context());
+          return;
+        }
+#endif
+        _device_ptr = (value_t *)detail::sift_malloc(_size, q);
+      }
+
+      size_t _size;
+      sycl::range<Dimension> _range;
+      bool _reference;
+      value_t *_host_ptr;
+      value_t *_device_ptr;
+    };
+    template <class T, memory_region Memory>
+    class device_memory<T, Memory, 0> : public device_memory<T, Memory, 1> // dimension 0 reuses the 1-D implementation
+    {
+    public:
+      using base = device_memory<T, Memory, 1>;
+      using value_t = typename base::value_t;
+      using accessor_t =
+          typename detail::memory_traits<Memory, T>::template accessor_t<0>;
+
+      /// Constructor with initial value: a 1-element base initialized with {val}.
+      device_memory(const value_t &val) : base(sycl::range<1>(1), {val}) {}
+
+      /// Default constructor: constructs the base with the single argument 1.
+      device_memory() : base(1) {}
+
+#ifdef INFRA_USM_LEVEL_NONE
+      /// Get sycl::accessor for the device memory object when usm is not used.
+      accessor_t get_access(sycl::handler &cgh)
+      {
+        auto buf = get_buffer(base::get_ptr()) // base::get_ptr() calls init() before returning the pointer
+                       .template reinterpret<T, 1>(sycl::range<1>(1));
+        return accessor_t(buf, cgh);
+      }
+#endif
+    };
+  }
+
+  template <class T, size_t Dimension> // device_memory bound to the `global` region
+  using global_memory = detail::device_memory<T, global, Dimension>;
+  template <class T, size_t Dimension> // device_memory bound to the `constant` region
+  using constant_memory = detail::device_memory<T, constant, Dimension>;
+  template <class T, size_t Dimension> // device_memory bound to the `shared` region
+  using shared_memory = detail::device_memory<T, shared, Dimension>;
+}
+
+#endif
diff --git a/src/mainSift.cpp b/src/mainSift.cpp
new file mode 100644
index 0000000..49ae4c7
--- /dev/null
+++ b/src/mainSift.cpp
@@ -0,0 +1,313 @@
+//********************************************************//
+// CUDA SIFT extractor by Marten Björkman aka Celebrandil //
+//              celle @ csc.kth.se                       //
+//********************************************************//
+
+// Modifications Copyright (C) 2023 Intel Corporation
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom
+// the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
+// OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+// OR OTHER DEALINGS IN THE SOFTWARE.
+
+// SPDX-License-Identifier: MIT
+
+#include <sycl/sycl.hpp>
+#include <iostream>
+#include <cmath>
+#include <iomanip>
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+
+#include "cudaImage.h"
+#include "cudaSift.h"
+#include "infra/infra.hpp"
+#include "Utility.h"
+
+#ifndef KERNEL_USE_PROFILE
+#define KERNEL_USE_PROFILE 0
+#endif
+
+void copyData(void *host, void *dev, size_t size);
+int ImproveHomography(SiftData &data, float *homography, int numLoops, float minScore, float maxAmbiguity, float thresh);
+void PrintMatchData(SiftData &siftData1, SiftData &siftData2, CudaImage &img);
+void MatchAll(SiftData &siftData1, SiftData &siftData2, float *homography);
+
+double ScaleUp(CudaImage &res, CudaImage &src);
+
+///////////////////////////////////////////////////////////////////////////////
+// Main program
+///////////////////////////////////////////////////////////////////////////////
+int main(int argc, char **argv) // argv[1]: device number, argv[2]: image set (0 = img1/img2, non-zero = left/righ)
+{
+  auto totalProgTimer_start = std::chrono::steady_clock::now();
+  int devNum = 0, imgSet = 0; // devNum is parsed but not used below; the device comes from gpu_selector
+  if (argc > 1)
+    devNum = std::atoi(argv[1]);
+  if (argc > 2)
+    imgSet = std::atoi(argv[2]);
+
+  float totTime = 0.0;
+  float imageInitTime = 0.0;
+  float extractSiftTime = 0.0;
+  float matchingTime = 0.0;
+
+  sycl::device dev = sycl::device(sycl::gpu_selector());
+  sycl::property_list q_prop{sycl::property::queue::in_order()};
+
+#ifdef DEVICE_TIMER
+  auto q_time_start = std::chrono::steady_clock::now();
+#endif
+  sycl::queue q_ct(dev, q_prop);
+#ifdef DEVICE_TIMER
+  auto q_time_stop = std::chrono::steady_clock::now();
+  imageInitTime += std::chrono::duration<float, std::micro>(q_time_stop - q_time_start).count();
+#endif
+
+  // Read images using OpenCV
+  cv::Mat limg, rimg;
+  auto ioRead_start = std::chrono::steady_clock::now();
+  if (imgSet)
+  {
+    cv::imread("../../data/left.pgm", 0).convertTo(limg, CV_32FC1);
+    cv::imread("../../data/righ.pgm", 0).convertTo(rimg, CV_32FC1);
+  }
+  else
+  {
+    cv::imread("../../data/img1.png", 0).convertTo(limg, CV_32FC1);
+    cv::imread("../../data/img2.png", 0).convertTo(rimg, CV_32FC1);
+  }
+  auto ioRead_stop = std::chrono::steady_clock::now();
+  float ioReadTime = std::chrono::duration<float, std::micro>(ioRead_stop - ioRead_start).count();
+  if (limg.empty() || rimg.empty()) // imread yields an empty matrix when a file cannot be read
+  {
+    std::cerr << "Failed to read the input images from ../../data" << std::endl;
+    return 1;
+  }
+  unsigned int w = limg.cols;
+  unsigned int h = limg.rows;
+  std::cout << "Image size = (" << w << "," << h << ")" << std::endl;
+
+  // Initialize Cuda images and download images to device
+  std::cout << "Initializing data..." << std::endl;
+  CudaImage img1, img2;
+
+  img1.Allocate(w, h, iAlignUp(w, 128), false, q_ct, imageInitTime, NULL, (float *)limg.data);
+  img2.Allocate(w, h, iAlignUp(w, 128), false, q_ct, imageInitTime, NULL, (float *)rimg.data);
+  try
+  {
+    img1.Download(q_ct, imageInitTime);
+    img2.Download(q_ct, imageInitTime);
+  }
+  catch (sycl::exception const &e)
+  {
+    std::cerr << e.what() << '\n';
+  }
+
+  // Extract Sift features from images
+  SiftData siftData1, siftData2;
+  float initBlur = 1.0f;
+  float thresh = (imgSet ? 4.5f : 2.0f);
+  InitSiftData(siftData1, q_ct, imageInitTime, 32768, true, true);
+  InitSiftData(siftData2, q_ct, imageInitTime, 32768, true, true);
+
+  // A bit of benchmarking
+  float *memoryTmp = AllocSiftTempMemory(w, h, 5, q_ct, imageInitTime, false);
+  for (int i = 0; i < 50; i++)
+  {
+    float time = 0.0;
+    try
+    {
+      ExtractSift(siftData1, img1, 5, initBlur, thresh, q_ct, time, 0.0f, false, memoryTmp);
+      extractSiftTime += time;
+      time = 0.0;
+      ExtractSift(siftData2, img2, 5, initBlur, thresh, q_ct, time, 0.0f, false, memoryTmp);
+    }
+    catch (std::exception const &e)
+    {
+      std::cerr << e.what() << '\n';
+    }
+    extractSiftTime += time;
+  }
+  FreeSiftTempMemory(memoryTmp, q_ct);
+
+  // Match Sift features and find a homography
+  for (int i = 0; i < 1; i++)
+    MatchSiftData(siftData1, siftData2, q_ct, matchingTime);
+  float homography[9] = {0.0f}; // zeroed so the reads below are defined if FindHomography throws
+  int numMatches = 0;           // likewise: printed below even when FindHomography throws
+  try
+  {
+    FindHomography(siftData1, homography, &numMatches, q_ct, matchingTime, 10000, 0.0f, 0.80f, 5.0);
+  }
+  catch (std::exception const &e)
+  {
+    std::cerr << e.what() << '\n';
+  }
+  int numFit = ImproveHomography(siftData1, homography, 5, 0.00f, 0.80f, 3.0);
+  float matchPercentage = 100.0f * numFit / std::min(siftData1.numPts, siftData2.numPts);
+
+  std::cout << "Number of original features: " << siftData1.numPts << " " << siftData2.numPts << std::endl;
+  std::cout << "Number of matching features: " << numFit << " " << numMatches << " " << matchPercentage << "% " << initBlur << " " << thresh << "\n"
+            << std::endl;
+
+#ifdef DEVICE_TIMER
+  totTime = imageInitTime + extractSiftTime + matchingTime;
+  std::cout << "Images initialization time = " << imageInitTime / 1000 << " ms" << std::endl;
+  std::cout << "Feature extraction time = " << extractSiftTime / 1000 << " ms" << std::endl;
+  std::cout << "Matching time = " << matchingTime / 1000 << " ms"
+            << "\n"
+            << std::endl;
+  std::cout << "Total Device Time = " << totTime / 1000 << " ms"
+            << "\n"
+            << std::endl;
+#endif
+  // data validation
+  auto dataVerficationTimer_start = std::chrono::steady_clock::now();
+  Utility::RunDataVerification(thresh, matchPercentage);
+  auto dataVerficationTimer_stop = std::chrono::steady_clock::now();
+  float dataVerificationTime =
+      std::chrono::duration<float, std::micro>(dataVerficationTimer_stop - dataVerficationTimer_start).count();
+  // Print out and store summary data
+  // PrintMatchData(siftData1, siftData2, img1);
+  // cv::imwrite("../../data/limg_pts.pgm", limg);
+  // MatchAll(siftData1, siftData2, homography);
+
+  // Free Sift data from device
+  FreeSiftData(siftData1, q_ct);
+  FreeSiftData(siftData2, q_ct);
+
+  auto totalProgTimer_end = std::chrono::steady_clock::now();
+  float totalProgramTime = std::chrono::duration<float, std::micro>(totalProgTimer_end - totalProgTimer_start).count() - ioReadTime - dataVerificationTime;
+  std::cout << "Total workload time = " << totalProgramTime / 1000 << " ms"
+            << "\n"
+            << std::endl;
+  return 0;
+}
+
+/**
+ * Debug helper: brute-force comparison of every feature in siftData1 with
+ * every feature in siftData2, printed to std::cout.
+ *
+ * Each feature i of the first set is mapped through `homography` (used as a
+ * row-major 3x3 matrix). Every feature j of the second set is listed when it
+ * lies within a squared distance of 100 of the mapped position, or when it is
+ * the match recorded in sift1[i].match. Line markers:
+ *   " *"  recorded match, inside the radius
+ *   " -"  recorded match, outside the radius
+ *   " +"  not the recorded match, but inside the radius
+ * A summary (features with at least one neighbour inside the radius) and the
+ * homography are printed at the end.
+ *
+ * NOTE: while the `#if 1` block below is enabled, elements 0..7 of the
+ * caller's `homography` array are overwritten with a fixed transform
+ * (x -> 1279 - x, y -> 959 - y when homography[8] == 1). Element 8 is used as
+ * passed in.
+ *
+ * @param siftData1  first feature set (m_data or h_data is read, depending on MANAGEDMEM)
+ * @param siftData2  second feature set
+ * @param homography 9 floats; modified as described above
+ */
+void MatchAll(SiftData &siftData1, SiftData &siftData2, float *homography)
+{
+#ifdef MANAGEDMEM
+  SiftPoint *sift1 = siftData1.m_data;
+  SiftPoint *sift2 = siftData2.m_data;
+#else
+  SiftPoint *sift1 = siftData1.h_data;
+  SiftPoint *sift2 = siftData2.h_data;
+#endif
+  const int numPts1 = siftData1.numPts;
+  const int numPts2 = siftData2.numPts;
+  const float maxSqErr = 100.0f; // squared pixel radius around the projected position
+  int numFound = 0;
+#if 1
+  homography[0] = homography[4] = -1.0f;
+  homography[1] = homography[3] = homography[6] = homography[7] = 0.0f;
+  homography[2] = 1279.0f;
+  homography[5] = 959.0f;
+#endif
+  for (int i = 0; i < numPts1; i++)
+  {
+    const SiftPoint &pt1 = sift1[i];
+    const float *data1 = pt1.data;
+    std::cout << i << ":" << pt1.scale << ":" << (int)pt1.orientation << " " << pt1.xpos << " " << pt1.ypos << std::endl;
+
+    // The projection depends only on feature i, so compute it once per i
+    // instead of once per (i, j) pair.
+    const float den = homography[6] * pt1.xpos + homography[7] * pt1.ypos + homography[8];
+    const float projX = (homography[0] * pt1.xpos + homography[1] * pt1.ypos + homography[2]) / den;
+    const float projY = (homography[3] * pt1.xpos + homography[4] * pt1.ypos + homography[5]) / den;
+
+    bool found = false;
+    for (int j = 0; j < numPts2; j++)
+    {
+      const SiftPoint &pt2 = sift2[j];
+      const float dx = projX - pt2.xpos;
+      const float dy = projY - pt2.ypos;
+      const float err = dx * dx + dy * dy;
+      const bool isNear = (err < maxSqErr);
+      const bool isMatch = (j == pt1.match);
+      if (!isNear && !isMatch)
+        continue; // nothing to report for this pair
+      if (isNear)
+        found = true;
+
+      // Descriptor correlation is only needed for the pairs that get printed.
+      const float *data2 = pt2.data;
+      float sum = 0.0f;
+      for (int k = 0; k < 128; k++)
+        sum += data1[k] * data2[k];
+
+      // At this point at least one of isNear / isMatch holds, so exactly
+      // three marker combinations are possible.
+      const char *marker = isMatch ? (isNear ? " *" : " -") : " +";
+      std::cout << marker;
+      std::cout << j << ":" << sum << ":" << (int)sqrt(err) << ":" << pt2.scale << ":" << (int)pt2.orientation << " " << pt2.xpos << " " << pt2.ypos << " " << (int)dx << " " << (int)dy << std::endl;
+    }
+    std::cout << std::endl;
+    if (found)
+      numFound++;
+  }
+  std::cout << "Number of finds: " << numFound << " / " << numPts1 << std::endl;
+  std::cout << homography[0] << " " << homography[1] << " " << homography[2] << std::endl; //%%%
+  std::cout << homography[3] << " " << homography[4] << " " << homography[5] << std::endl; //%%%
+  std::cout << homography[6] << " " << homography[7] << " " << homography[8] << std::endl; //%%%
+}
+
+/**
+ * Debug helper that paints match information into the host copy of an image.
+ *
+ * For every feature j of siftData1:
+ *  - if its match_error is below 5 and its recorded match index refers to a
+ *    point of siftData2, a white (255) line is drawn from the feature position
+ *    towards the matched position;
+ *  - a cross is drawn at the feature position, first in black (0) shifted by
+ *    one pixel in x and y, then in white (255) at the position itself. The arm
+ *    length is limited by 1.41 * scale and by the distance to the image border.
+ *
+ * @param siftData1 features whose positions and matches are drawn
+ * @param siftData2 features that sift1[j].match indexes into
+ * @param img       image whose h_data (width x height floats) is modified in place
+ */
+void PrintMatchData(SiftData &siftData1, SiftData &siftData2, CudaImage &img)
+{
+  const int numPts = siftData1.numPts;
+  const int numPts2 = siftData2.numPts;
+#ifdef MANAGEDMEM
+  SiftPoint *sift1 = siftData1.m_data;
+  SiftPoint *sift2 = siftData2.m_data;
+#else
+  SiftPoint *sift1 = siftData1.h_data;
+  SiftPoint *sift2 = siftData2.h_data;
+#endif
+  float *h_img = img.h_data;
+  const int w = img.width;
+  const int h = img.height;
+  // precision(3) has the same effect as std::setprecision(3) and returns the
+  // previous setting, so the caller's precision can be restored exactly.
+  const std::streamsize savedPrecision = std::cout.precision(3);
+  for (int j = 0; j < numPts; j++)
+  {
+    const int m = sift1[j].match;
+    // The match index is produced elsewhere and may not refer to a point of
+    // siftData2 (for example a negative "no match" value); never index with it unchecked.
+    const bool validMatch = (m >= 0 && m < numPts2);
+    if (validMatch && sift1[j].match_error < 5)
+    {
+      float dx = sift2[m].xpos - sift1[j].xpos;
+      float dy = sift2[m].ypos - sift1[j].ypos;
+#if 0
+      if (false && sift1[j].xpos>550 && sift1[j].xpos<600) {
+        std::cout << "pos1=(" << (int)sift1[j].xpos << "," << (int)sift1[j].ypos << ") ";
+        std::cout << j << ": " << "score=" << sift1[j].score << "  ambiguity=" << sift1[j].ambiguity << "  match=" << m << "  ";
+        std::cout << "scale=" << sift1[j].scale << "  ";
+        std::cout << "error=" << (int)sift1[j].match_error << "  ";
+        std::cout << "orient=" << (int)sift1[j].orientation << "," << (int)sift2[m].orientation << "  ";
+        std::cout << " delta=(" << (int)dx << "," << (int)dy << ")" << std::endl;
+      }
+#endif
+#if 1
+      // One pixel per step along the dominant axis; len == 0 draws nothing.
+      int len = (int)(fabs(dx) > fabs(dy) ? fabs(dx) : fabs(dy));
+      for (int l = 0; l < len; l++)
+      {
+        int x = (int)(sift1[j].xpos + dx * l / len);
+        int y = (int)(sift1[j].ypos + dy * l / len);
+        // The matched position comes from the other image, so the line can
+        // leave this image; skip pixels that fall outside it.
+        if (x < 0 || x >= w || y < 0 || y >= h)
+          continue;
+        h_img[y * w + x] = 255.0f;
+      }
+#endif
+    }
+    int x = (int)(sift1[j].xpos + 0.5);
+    int y = (int)(sift1[j].ypos + 0.5);
+    // Arm length: clamped so that both crosses stay inside the image. A
+    // non-positive value (feature on or outside the border) draws nothing.
+    int s = std::min(x, std::min(y, std::min(w - x - 2, std::min(h - y - 2, (int)(1.41 * sift1[j].scale)))));
+    int p = y * w + x;
+    // Black cross, offset by one pixel in x and y.
+    p += (w + 1);
+    for (int r = 0; r < s; r++)
+      h_img[p - r] = h_img[p + r] = h_img[p - r * w] = h_img[p + r * w] = 0.0f;
+    // White cross at the feature position.
+    p -= (w + 1);
+    for (int r = 0; r < s; r++)
+      h_img[p - r] = h_img[p + r] = h_img[p - r * w] = h_img[p + r * w] = 255.0f;
+  }
+  std::cout.precision(savedPrecision);
+}
diff --git a/src/matching.dp.cpp b/src/matching.dp.cpp
new file mode 100644
index 0000000..275e561
--- /dev/null
+++ b/src/matching.dp.cpp
@@ -0,0 +1,1832 @@
+// Modifications Copyright (C) 2023 Intel Corporation
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom
+// the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
+// OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+// OR OTHER DEALINGS IN THE SOFTWARE.
+
+// SPDX-License-Identifier: MIT
+
+#include <chrono>
+#include <sycl/sycl.hpp>
+#include <random>
+#include "infra/infra.hpp"
+#include "cudaSift.h"
+#include "cudautils.h"
+
+//================= Device matching functions =====================//
+
+/**
+ * Copies a pitched 2D block byte by byte.
+ *
+ * @param src        first byte of the first source row
+ * @param dst        first byte of the first destination row
+ * @param src_pitch  distance in bytes between consecutive source rows
+ * @param dst_pitch  distance in bytes between consecutive destination rows
+ * @param numPts     number of rows to copy; nothing is copied when <= 0
+ * @param width      number of bytes copied per row
+ */
+void memcopyKernel(float *src, float *dst, size_t src_pitch, size_t dst_pitch, int numPts, size_t width)
+{
+  const char *srcRow = (const char *)src;
+  char *dstRow = (char *)dst;
+
+  for (int row = 0; row < numPts; ++row)
+  {
+    // The byte counter has the same type as `width`: an int counter would be
+    // converted to unsigned for the comparison and overflow for widths above INT_MAX.
+    for (size_t b = 0; b < width; ++b)
+    {
+      dstRow[b] = srcRow[b];
+    }
+    srcRow += src_pitch;
+    dstRow += dst_pitch;
+  }
+}
+
+// Correlates the descriptor of one sift1 point with a tile of 16 sift2 points
+// and writes the 16 dot products into corrData.
+//
+// Index layout used by the code: tx = local id in dim 2, ty = local id in
+// dim 1. The arithmetic (16 * ty + tx, sums[i + 8], ...) assumes both run over
+// 0..15 - the launch configuration is not visible here, TODO confirm.
+// get_group(2) selects the sift1 point p1; get_group(1) selects the tile of
+// 16 sift2 points and ty selects the point p2 inside that tile.
+//
+//   sift1, sift2 : feature arrays; only the `data` descriptors are read
+//   corrData     : output; row p1 holds get_group_range(1) * 16 values
+//   numPts1      : not referenced in this kernel
+//   numPts2      : rows with p2 >= numPts2 contribute a correlation of 0
+//   item_ct1     : work-item handle
+//   siftPoint    : work-group scratch for the 128 floats of descriptor p1
+//   sums         : work-group scratch for 16 x 16 partial sums
+void MatchSiftPoints(SiftPoint *sift1, SiftPoint *sift2, float *corrData, int numPts1, int numPts2,
+                     sycl::nd_item<3> item_ct1, float *siftPoint, float *sums)
+{
+
+  const int tx = item_ct1.get_local_id(2);
+  const int ty = item_ct1.get_local_id(1);
+  const int p1 = item_ct1.get_group(2);
+  const int p2 = item_ct1.get_group(1) * 16 + ty;
+  const float *ptr1 = sift1[p1].data;
+  const float *ptr2 = sift2[p2].data; // only dereferenced below when p2 < numPts2
+  const int i = 16 * ty + tx;
+  // Rows ty = 0..7 cover i = 0..127, i.e. the whole descriptor of p1.
+  if (ty < 8)
+    siftPoint[i] = ptr1[i];
+
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+  // Each work-item accumulates elements tx, tx + 16, ..., tx + 112 of the
+  // product between descriptor p1 and descriptor p2.
+  float sum = 0.0f;
+  if (p2 < numPts2)
+    for (int j = 0; j < 8; j++)
+      sum += siftPoint[16 * j + tx] * ptr2[16 * j + tx];
+  sums[i] = sum;
+
+  // Fold the 16 partial sums of every row down to 4 (slots 0..3 of the row).
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+  if (tx < 8)
+    sums[i] += sums[i + 8];
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+  if (tx < 4)
+    sums[i] += sums[i + 4];
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+  // Row ty == 0 finishes the job: work-item tx adds the 4 remaining values of
+  // row tx and stores the correlation for sift2 point (tile * 16 + tx).
+  if (ty == 0)
+  {
+    sum = sums[16 * tx + 0] + sums[16 * tx + 1] + sums[16 * tx + 2] + sums[16 * tx + 3];
+    corrData[p1 * item_ct1.get_group_range(1) * 16 +
+             item_ct1.get_group(1) * 16 + tx] = sum;
+  }
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+}
+
+// Computes a 16 x 16 tile of the sift1 x sift2 correlation matrix.
+//
+// get_group(2) selects the tile of sift1 points and get_group(1) the tile of
+// sift2 points. Both tiles are first staged in siftPoints1 / siftPoints2
+// (16 descriptors of 128 floats each); afterwards work-item (tx, ty) forms the
+// dot product of sift1 descriptor ty and sift2 descriptor tx of the tile.
+// Entries whose sift2 index is >= numPts2 are written as -1.0f; rows whose
+// sift1 index is >= numPts1 are not written at all.
+void MatchSiftPoints2(SiftPoint *sift1, SiftPoint *sift2, float *corrData, int numPts1, int numPts2,
+                      sycl::nd_item<3> item_ct1, float *siftPoints1,
+                      float *siftPoints2)
+{
+  const int lx = item_ct1.get_local_id(2);
+  const int ly = item_ct1.get_local_id(1);
+
+  // Staging phase: row ly of the work-group copies descriptor ly of each tile.
+  // Source indices are clamped to the last valid point so that partial tiles
+  // still read valid memory.
+  const unsigned int srcIdx1 = sycl::min((unsigned int)(numPts1 - 1),
+                                         (unsigned int)(item_ct1.get_group(2) * 16 + ly));
+  const unsigned int srcIdx2 = sycl::min((unsigned int)(numPts2 - 1),
+                                         (unsigned int)(item_ct1.get_group(1) * 16 + ly));
+  const float *src1 = sift1[srcIdx1].data;
+  const float *src2 = sift2[srcIdx2].data;
+  float *stage1 = &siftPoints1[128 * ly];
+  float *stage2 = &siftPoints2[128 * ly];
+  for (int e = lx; e < 128; e += 16)
+  {
+    stage1[e] = src1[e];
+    stage2[e] = src2[e];
+  }
+  item_ct1.barrier();
+
+  // Correlation phase.
+  const int row = item_ct1.get_group(2) * 16 + ly; // index into sift1
+  const int col = item_ct1.get_group(1) * 16 + lx; // index into sift2
+  const float *desc1 = &siftPoints1[ly * 128];
+  const float *desc2 = &siftPoints2[lx * 128];
+  float acc = 0.0f;
+  for (int step = 0; step < 128; step++)
+  {
+    // Start offset rotated by lx (original note: avoid bank conflicts).
+    const int e = (step + lx) & 127;
+    acc += desc1[e] * desc2[e];
+  }
+  if (row < numPts1)
+  {
+    const float value = (col < numPts2 ? acc : -1.0f);
+    corrData[row * item_ct1.get_group_range(1) * 16 + col] = value;
+  }
+}
+
+/**
+ * For 16 sift1 points per work-group, scans the corresponding rows of the
+ * correlation matrix and stores best score, ambiguity (second best / best),
+ * index of the best sift2 point and that point's position into sift1.
+ *
+ * Index layout used by the code: tx = local id in dim 2 (0..15 assumed by the
+ * `ty * 16 + tx` arithmetic), ty = local id in dim 1 selects the sift1 point
+ * inside the group. Points past numPts1 are clamped to the last valid point,
+ * so several rows may compute and store the same result.
+ *
+ * @param corrData  correlation matrix, corrWidth values per sift1 point
+ * @param sift1     points receiving score / ambiguity / match / match_xpos / match_ypos
+ * @param sift2     points the match index refers to
+ * @param numPts1   number of valid sift1 points
+ * @param corrWidth number of correlation values per row
+ * @param siftSize  not referenced
+ * @param item_ct1  work-item handle
+ * @param maxScore  work-group scratch, one best score per work-item
+ * @param maxScor2  work-group scratch, one second-best score per work-item
+ * @param maxIndex  work-group scratch, one best index per work-item
+ */
+void FindMaxCorr(float *corrData, SiftPoint *sift1, SiftPoint *sift2, int numPts1, int corrWidth, int siftSize,
+                 sycl::nd_item<3> item_ct1, float *maxScore, float *maxScor2,
+                 int *maxIndex)
+{
+
+  const int tx = item_ct1.get_local_id(2);
+  const int ty = item_ct1.get_local_id(1);
+  const int idx = ty * 16 + tx;
+  int p1 = item_ct1.get_group(2) * 16 + item_ct1.get_local_id(1);
+  p1 = (p1 >= numPts1 ? numPts1 - 1 : p1);
+  maxScore[idx] = -1.0f;
+  maxScor2[idx] = -1.0f;
+  maxIndex[idx] = -1;
+  item_ct1.barrier();
+  // Each lane scans every 16th value of the row and keeps its own best two.
+  float *corrs = &corrData[p1 * corrWidth];
+  for (int i = tx; i < corrWidth; i += 16)
+  {
+    float val = corrs[i];
+    if (val > maxScore[idx])
+    {
+      maxScor2[idx] = maxScore[idx];
+      maxScore[idx] = val;
+      maxIndex[idx] = i;
+    }
+    else if (val > maxScor2[idx])
+      maxScor2[idx] = val;
+  }
+  item_ct1.barrier();
+  // Tree reduction over the 16 lanes of a row.
+  for (int len = 8; len > 0; len /= 2)
+  {
+    // Only the lower `len` lanes may fold in their partner at distance `len`.
+    // With a fixed `tx < 8` test, lanes len..7 would rewrite slots that lower
+    // lanes read in the same step (a data race that can also count the best
+    // score twice and inflate the second-best score).
+    if (tx < len)
+    {
+      float val = maxScore[idx + len];
+      int i = maxIndex[idx + len];
+      if (val > maxScore[idx])
+      {
+        maxScor2[idx] = maxScore[idx];
+        maxScore[idx] = val;
+        maxIndex[idx] = i;
+      }
+      else if (val > maxScor2[idx])
+        maxScor2[idx] = val;
+      float va2 = maxScor2[idx + len];
+      if (va2 > maxScor2[idx])
+        maxScor2[idx] = va2;
+    }
+    item_ct1.barrier();
+  }
+  // Lane 0 of each row holds the reduced result.
+  // NOTE(review): maxIndex stays -1 when no correlation exceeds -1.0f, in
+  // which case sift2 is indexed with -1 below - confirm this cannot happen.
+  if (tx == 0)
+  {
+    sift1[p1].score = maxScore[ty * 16];
+    sift1[p1].ambiguity = maxScor2[ty * 16] / (maxScore[ty * 16] + 1e-6);
+    sift1[p1].match = maxIndex[ty * 16];
+    sift1[p1].match_xpos = sift2[maxIndex[ty * 16]].xpos;
+    sift1[p1].match_ypos = sift2[maxIndex[ty * 16]].ypos;
+  }
+}
+
+// Version based on suggestion by Nicholas Lin
+/**
+ * Finds, for every sift1 point, the best and second-best correlated sift2
+ * point by computing the descriptor dot products directly (version based on
+ * a suggestion by Nicholas Lin).
+ *
+ * Index layout used by the code: tx = local id in dim 2, ty = local id in
+ * dim 1; row ty of a group handles sift1 point p1, and the 16 lanes of the row
+ * split the sift2 points among themselves (p2 = tx, tx + 16, ...). The fixed
+ * offsets 16 / `ty * 16` assume a local range of 16 in both dimensions.
+ *
+ * @param corrData scratch in which each sift1 point owns 2 * block_dim floats:
+ *                 [0..15] best score per lane, [16..31] second best per lane
+ * @param sift1    points receiving score / ambiguity / match / match_xpos / match_ypos
+ * @param sift2    candidate points
+ * @param numPts1  number of valid sift1 points
+ * @param numPts2  number of valid sift2 points
+ * @param item_ct1 work-item handle
+ * @param maxIndex work-group scratch, one best index per work-item
+ */
+void FindMaxCorr3(float *corrData, SiftPoint *sift1, SiftPoint *sift2, int numPts1, int numPts2,
+                  sycl::nd_item<3> item_ct1, int *maxIndex)
+{
+  int block_dim = item_ct1.get_local_range().get(2); // blockDim.x == 16
+  const int tx = item_ct1.get_local_id(2);
+  const int ty = item_ct1.get_local_id(1);
+  const int p1 = item_ct1.get_group(2) * block_dim + ty;
+  const int idx = ty * 16 + tx;
+  // Rows past the last point do no work, but they must still take part in
+  // every barrier below.
+  const bool active = (p1 < numPts1);
+
+  maxIndex[idx] = 0;
+  item_ct1.barrier();
+
+  float *corrs = NULL;
+  if (active)
+  {
+    corrs = &corrData[p1 * block_dim * 2];
+    corrs[tx] = 0.0f;
+    corrs[tx + 16] = 0.0f;
+    const float *pt1 = sift1[p1].data;
+    for (int p2 = tx; p2 < numPts2; p2 += 16)
+    {
+      float *pt2 = sift2[p2].data;
+      float sum = 0.0f;
+      for (int i = 0; i < 128; i++)
+        sum += pt1[i] * pt2[i];
+      if (sum > corrs[tx])
+      {
+        corrs[tx + 16] = corrs[tx];
+        corrs[tx] = sum;
+        maxIndex[idx] = p2;
+      }
+      else if (sum > corrs[tx + 16])
+        corrs[tx + 16] = sum;
+    }
+  }
+  item_ct1.barrier();
+  // Tree reduction over the 16 lanes of a row. The barrier sits outside the
+  // `active` test: a work-group barrier has to be reached by all work-items of
+  // the group, and in a partially filled last group some rows are inactive.
+  for (int len = 8; len > 0; len /= 2)
+  {
+    if (active && tx < len)
+    {
+      float val = corrs[tx + len];
+      int i = maxIndex[idx + len];
+      if (val > corrs[tx])
+      {
+        corrs[tx + 16] = corrs[tx];
+        corrs[tx] = val;
+        maxIndex[idx] = i;
+      }
+      else if (val > corrs[tx + 16])
+        corrs[tx + 16] = val;
+      float va2 = corrs[tx + 16 + len];
+      if (va2 > corrs[tx + 16])
+        corrs[tx + 16] = va2;
+    }
+    item_ct1.barrier();
+  }
+  // Lane 0 of each active row publishes the reduced result.
+  if (active && tx == 0)
+  {
+    sift1[p1].score = corrs[0];
+    sift1[p1].ambiguity = corrs[16] / (corrs[0] + 1e-6);
+    sift1[p1].match = maxIndex[ty << 4];
+    sift1[p1].match_xpos = sift2[maxIndex[ty << 4]].xpos;
+    sift1[p1].match_ypos = sift2[maxIndex[ty << 4]].ypos;
+  }
+}
+
+// Tile shape shared by FindMaxCorr2 and FindMaxCorr4: FMC2W lanes (tx) split
+// the 128 descriptor elements, FMC2H rows (ty) work on different points.
+#define FMC2W 16
+#define FMC2H 4
+
+// Finds the best and second-best correlated sift2 point for ONE sift1 point
+// per work-group (p1 = get_group(2)).
+//
+// Row ty visits sift2 points ty, ty + FMC2H, ...; inside a row the FMC2W lanes
+// each sum every FMC2W-th element product and the partial sums are combined
+// with ShiftDown (defined elsewhere - presumably a shuffle-down across lanes,
+// TODO confirm, including its behaviour when rows run a different number of
+// loop iterations).
+//
+//   sift1, sift2 : feature arrays
+//   numPts1      : groups with p1 >= numPts1 return immediately
+//   numPts2      : number of sift2 candidates
+//   item_ct1     : work-item handle
+//   siftPoint    : work-group scratch for the 128 floats of descriptor p1
+//   maxScore, maxScor2, maxIndex : work-group scratch, FMC2H entries each
+void FindMaxCorr2(SiftPoint *sift1, SiftPoint *sift2, int numPts1, int numPts2,
+                  sycl::nd_item<3> item_ct1, float *siftPoint, float *maxScore,
+                  float *maxScor2, int *maxIndex)
+{
+
+  const int p1 = item_ct1.get_group(2);
+  // p1 depends only on the group id, so the whole group leaves together and
+  // no work-item is left waiting at a barrier.
+  if (p1 >= numPts1)
+    return;
+  const int tx = item_ct1.get_local_id(2);
+  const int ty = item_ct1.get_local_id(1);
+  const int idx = ty * FMC2W + tx;
+  // The first FMC2H work-items initialise the per-row results.
+  if (idx < FMC2H)
+  {
+    maxScore[idx] = -1.0f;
+    maxScor2[idx] = -1.0f;
+    maxIndex[idx] = 0;
+  }
+  item_ct1.barrier();
+  // Stage descriptor p1; FMC2W * FMC2H work-items cover 128 floats in two passes.
+  const float *pt1 = sift1[p1].data;
+  for (int i = idx; i < 128; i += FMC2W * FMC2H)
+    siftPoint[i] = pt1[i];
+
+  item_ct1.barrier();
+  for (int p2 = ty; p2 < numPts2; p2 += FMC2H)
+  {
+    const float *pt2 = sift2[p2].data;
+    float sum = 0.0f;
+    for (int j = tx; j < 128; j += FMC2W)
+      sum += siftPoint[j] * pt2[j];
+    // Combine the FMC2W partial sums; only the value held by tx == 0 is used.
+    for (int j = FMC2W / 2; j > 0; j /= 2)
+      sum += ShiftDown(sum, j, item_ct1);
+    if (tx == 0)
+    {
+      if (sum > maxScore[ty])
+      {
+        maxScor2[ty] = maxScore[ty];
+        maxScore[ty] = sum;
+        maxIndex[ty] = p2;
+      }
+      else if (sum > maxScor2[ty])
+        maxScor2[ty] = sum;
+    }
+  }
+
+  item_ct1.barrier();
+  // Merge the FMC2H per-row results into entry 0.
+  for (int len = FMC2H / 2; len > 0; len /= 2)
+  {
+    if (ty == 0 && tx < len)
+    {
+      float val = maxScore[tx + len];
+      int p2 = maxIndex[tx + len];
+      if (val > maxScore[tx])
+      {
+        maxScor2[tx] = maxScore[tx];
+        maxScore[tx] = val;
+        maxIndex[tx] = p2;
+      }
+      else if (val > maxScor2[tx])
+        maxScor2[tx] = val;
+      float va2 = maxScor2[tx + len];
+      if (va2 > maxScor2[tx])
+        maxScor2[tx] = va2;
+    }
+
+    item_ct1.barrier();
+  }
+  // A single work-item publishes the result for p1.
+  if (ty == 0 && tx == 0)
+  {
+    sift1[p1].score = maxScore[0];
+    sift1[p1].ambiguity = maxScor2[0] / (maxScore[0] + 1e-6);
+    sift1[p1].match = maxIndex[0];
+    sift1[p1].match_xpos = sift2[maxIndex[0]].xpos;
+    sift1[p1].match_ypos = sift2[maxIndex[0]].ypos;
+  }
+}
+
+/**
+ * Finds the best and second-best correlated sift2 point for FMC2H sift1
+ * points per work-group: row ty handles sift1 point get_group(2) * FMC2H + ty
+ * and visits every sift2 point, with the FMC2W lanes of the row splitting the
+ * 128 descriptor elements. Partial sums are combined with ShiftDown (defined
+ * elsewhere - presumably a shuffle-down across lanes, TODO confirm).
+ *
+ * Rows whose point index is >= numPts1 (partially filled last group) stage a
+ * zero descriptor and store nothing, so sift1 is never read or written past
+ * its last valid point. They still execute the loop and both barriers.
+ *
+ * @param sift1     points receiving score / ambiguity / match / match_xpos / match_ypos
+ * @param sift2     candidate points
+ * @param numPts1   number of valid sift1 points
+ * @param numPts2   number of valid sift2 points
+ * @param item_ct1  work-item handle
+ * @param siftPoint work-group scratch, FMC2H descriptors of 128 floats
+ * @param maxScore  work-group scratch, FMC2H entries
+ * @param maxScor2  work-group scratch, FMC2H entries
+ * @param maxIndex  work-group scratch, FMC2H entries
+ */
+void FindMaxCorr4(SiftPoint *sift1, SiftPoint *sift2, int numPts1, int numPts2,
+                  sycl::nd_item<3> item_ct1, float *siftPoint, float *maxScore,
+                  float *maxScor2, int *maxIndex)
+{
+
+  const int tx = item_ct1.get_local_id(2);
+  const int ty = item_ct1.get_local_id(1);
+  if (tx == 0)
+  {
+    maxScore[ty] = -1.0f;
+    maxScor2[ty] = -1.0f;
+    maxIndex[ty] = 0;
+  }
+  const int p1 = item_ct1.get_group(2) * FMC2H + ty;
+  const bool valid = (p1 < numPts1);
+  // Stage the descriptor of this row; rows without a point get zeros so the
+  // scratch memory is fully initialised without touching sift1[p1].
+  for (int j = tx; j < 128; j += FMC2W)
+    siftPoint[128 * ty + j] = valid ? sift1[p1].data[j] : 0.0f;
+
+  item_ct1.barrier();
+  for (int p2 = 0; p2 < numPts2; p2++)
+  {
+    const float *pt2 = sift2[p2].data;
+    float sum = 0.0f;
+    for (int j = tx; j < 128; j += FMC2W)
+      sum += siftPoint[128 * ty + j] * pt2[j];
+    // Combine the FMC2W partial sums; only the value held by tx == 0 is used.
+    for (int j = FMC2W / 2; j > 0; j /= 2)
+      sum += ShiftDown(sum, j, item_ct1);
+    if (tx == 0)
+    {
+      if (sum > maxScore[ty])
+      {
+        maxScor2[ty] = maxScore[ty];
+        maxScore[ty] = sum;
+        maxIndex[ty] = p2;
+      }
+      else if (sum > maxScor2[ty])
+        maxScor2[ty] = sum;
+    }
+  }
+
+  item_ct1.barrier();
+  // Lane 0 of each row that owns a real point publishes the result.
+  if (tx == 0 && valid)
+  {
+    sift1[p1].score = maxScore[ty];
+    sift1[p1].ambiguity = maxScor2[ty] / (maxScore[ty] + 1e-6);
+    sift1[p1].match = maxIndex[ty];
+    sift1[p1].match_xpos = sift2[maxIndex[ty]].xpos;
+    sift1[p1].match_ypos = sift2[maxIndex[ty]].ypos;
+  }
+}
+
+// Resets the match score of sift1 points to zero, one point per work-item.
+// The point index is get_group(2) * 64 + get_local_id(2), clamped to the last
+// valid point so that surplus work-items rewrite that last point.
+void CleanMatches(SiftPoint *sift1, int numPts1, sycl::nd_item<3> item_ct1)
+{
+  const unsigned int lastPoint = (unsigned int)(numPts1 - 1);
+  const unsigned int ownPoint =
+      (unsigned int)(item_ct1.get_group(2) * 64 + item_ct1.get_local_id(2));
+  const int target = sycl::min(ownPoint, lastPoint);
+  sift1[target].score = 0.0f;
+}
+
+// Tile parameters of FindMaxCorr10.
+#define M7W 32   // sift1 points handled per work-group
+#define M7H 32   // sift2 points staged per loop iteration
+#define M7R 4    // sift2 points scored by one work-item per iteration
+#define NRX 2    // sift1 points scored by one work-item
+#define NDIM 128 // floats per descriptor
+
+// Finds the best and second-best correlated sift2 point for a tile of M7W
+// sift1 points per work-group.
+//
+// The sift1 tile is staged once in buffer1; sift2 is then streamed through
+// buffer2 in tiles of M7H points. The loops stepping by M7H / M7R over ty and
+// by M7W over tx suggest a local range of M7W x (M7H / M7R) - the launch is
+// not visible here, TODO confirm.
+//
+//   sift1, sift2 : feature arrays; sift1 receives score / match / match_xpos /
+//                  match_ypos / ambiguity
+//   numPts1      : number of valid sift1 points (loads are clamped to it)
+//   numPts2      : number of valid sift2 points
+//   item_ct1     : work-item handle
+//   buffer1      : work-group scratch, M7W descriptors as float4; reused at the
+//                  end for the per-work-item scores and indices
+//   buffer2      : work-group scratch, M7H descriptors as float4
+void FindMaxCorr10(SiftPoint *sift1, SiftPoint *sift2, int numPts1, int numPts2,
+                   sycl::nd_item<3> item_ct1, sycl::float4 *buffer1,
+                   sycl::float4 *buffer2)
+{
+
+  int tx = item_ct1.get_local_id(2);
+  int ty = item_ct1.get_local_id(1);
+  int bp1 = M7W * item_ct1.get_group(2); // first sift1 point of this group
+  // Stage the sift1 tile. Descriptor j is stored rotated by j float4 slots
+  // ((d + j) % 32); the read in the scoring loop applies the same rotation.
+  for (int j = ty; j < M7W; j += M7H / M7R)
+  {
+    int p1 = sycl::min((int)(bp1 + j), (int)(numPts1 - 1));
+    for (int d = tx; d < NDIM / 4; d += M7W)
+    {
+      buffer1[(j * NDIM / 4 + (d + j) % (NDIM / 4))] = ((sycl::float4 *)&sift1[p1].data)[d];
+      // int idx = j * NDIM / 4 + (d + j) % (NDIM / 4);
+      // if (idx < 1024)
+      //   buffer1[idx] = 0;
+    }
+  }
+
+  // Running best / second-best score and best index for the NRX sift1 points
+  // owned by this work-item.
+  float max_score[NRX];
+  float sec_score[NRX];
+  int index[NRX];
+  for (int i = 0; i < NRX; i++)
+  {
+    max_score[i] = 0.0f;
+    sec_score[i] = 0.0f;
+    index[i] = -1;
+  }
+  int idx = ty * M7W + tx;
+  int ix = idx % (M7W / NRX); // sift1 points ix and ix + M7W / NRX
+  int iy = idx / (M7W / NRX); // sift2 points M7R * iy .. M7R * iy + M7R - 1 of the tile
+  // Only complete tiles of M7H sift2 points are visited: the loop stops before
+  // a trailing partial tile (numPts2 % M7H points).
+  for (int bp2 = 0; bp2 < numPts2 - M7H + 1; bp2 += M7H)
+  {
+    for (int j = ty; j < M7H; j += M7H / M7R)
+    {
+      int p2 = sycl::min((int)(bp2 + j), (int)(numPts2 - 1));
+      for (int d = tx; d < NDIM / 4; d += M7W)
+        buffer2[j * NDIM / 4 + d] = ((sycl::float4 *)&sift2[p2].data)[d];
+    }
+
+    // Makes buffer2 (and, in the first iteration, buffer1) visible to the group.
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+
+    // Only the first M7W * M7H / M7R / NRX work-items score; each covers an
+    // NRX x M7R patch of the M7W x M7H tile.
+    if (idx < M7W * M7H / M7R / NRX)
+    {
+      float score[M7R][NRX];
+      for (int dy = 0; dy < M7R; dy++)
+        for (int i = 0; i < NRX; i++)
+          score[dy][i] = 0.0f;
+      for (int d = 0; d < NDIM / 4; d++)
+      {
+        sycl::float4 v1[NRX];
+        for (int i = 0; i < NRX; i++)
+          v1[i] = buffer1[((M7W / NRX) * i + ix) * NDIM / 4 + (d + (M7W / NRX) * i + ix) % (NDIM / 4)];
+        // v1[i] = buffer2[0];
+        for (int dy = 0; dy < M7R; dy++)
+        {
+          sycl::float4 v2 = buffer2[(M7R * iy + dy) * (NDIM / 4) + d];
+          // sycl::float4 v2 = sycl::float4(0.0f);
+          for (int i = 0; i < NRX; i++)
+          {
+            score[dy][i] += v1[i].x() * v2.x();
+            score[dy][i] += v1[i].y() * v2.y();
+            score[dy][i] += v1[i].z() * v2.z();
+            score[dy][i] += v1[i].w() * v2.w();
+          }
+        }
+      }
+      // Fold the patch into the running best / second best.
+      for (int dy = 0; dy < M7R; dy++)
+      {
+        for (int i = 0; i < NRX; i++)
+        {
+          if (score[dy][i] > max_score[i])
+          {
+            sec_score[i] = max_score[i];
+            max_score[i] = score[dy][i];
+            index[i] =
+                sycl::min((int)(bp2 + M7R * iy + dy), (int)(numPts2 - 1));
+          }
+          else if (score[dy][i] > sec_score[i])
+            sec_score[i] = score[dy][i];
+        }
+      }
+    }
+
+    // buffer2 is overwritten by the next iteration.
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+  }
+
+  // buffer1 is no longer needed as descriptor storage; reuse it as three
+  // consecutive arrays of M7W * M7H / M7R entries each.
+  float *scores1 = (float *)buffer1;
+  float *scores2 = &scores1[M7W * M7H / M7R];
+  int *indices = (int *)&scores2[M7W * M7H / M7R];
+  if (idx < M7W * M7H / M7R / NRX)
+  {
+    for (int i = 0; i < NRX; i++)
+    {
+      scores1[iy * M7W + (M7W / NRX) * i + ix] = max_score[i];
+      scores2[iy * M7W + (M7W / NRX) * i + ix] = sec_score[i];
+      indices[iy * M7W + (M7W / NRX) * i + ix] = index[i];
+    }
+  }
+
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+
+  // Row ty == 0: work-item tx merges the M7H / M7R partial results stored for
+  // sift1 point bp1 + tx and writes the final match.
+  if (ty == 0)
+  {
+    float max_score = scores1[tx];
+    float sec_score = scores2[tx];
+    int index = indices[tx];
+    for (int y = 0; y < M7H / M7R; y++)
+      if (index != indices[y * M7W + tx]) // entries naming the current best index are skipped
+      {
+        if (scores1[y * M7W + tx] > max_score)
+        {
+          sec_score = sycl::max(max_score, sec_score);
+          max_score = scores1[y * M7W + tx];
+          index = indices[y * M7W + tx];
+        }
+        else if (scores1[y * M7W + tx] > sec_score)
+          sec_score = scores1[y * M7W + tx];
+      }
+    // NOTE(review): bp1 + tx is not compared with numPts1 here, and index is
+    // still -1 when no score exceeded 0 (e.g. numPts2 < M7H), which makes
+    // sift2[index] read before the array - confirm callers rule both out.
+    sift1[bp1 + tx].score = max_score;
+    // sift1[bp1 + tx].score = max_score[0];
+    sift1[bp1 + tx].match = index;
+    sift1[bp1 + tx].match_xpos = sift2[index].xpos;
+    sift1[bp1 + tx].match_ypos = sift2[index].ypos;
+    sift1[bp1 + tx].ambiguity = sec_score / (max_score + 1e-6f);
+  }
+}
+
+// Tile parameters of FindMaxCorr8 / FindMaxCorr9.
+#define FMC_GH 512 // sift2 points covered by one group along dim 1
+#define FMC_BW 32  // sift1 points per tile
+#define FMC_BH 32  // sift2 points per tile
+#define FMC_BD 16  // float4 descriptor slices staged at a time
+#define FMC_TW 1   // sift1 points scored per work-item
+#define FMC_TH 4   // sift2 points scored per work-item
+#define FMC_NW (FMC_BW / FMC_TW) //  32
+#define FMC_NH (FMC_BH / FMC_TH) //   8
+#define FMC_NT (FMC_NW * FMC_NH) // 256 = 8 warps
+
+// Device-global integer, initialised to 0. The kernels below receive a
+// `volatile int *lock` parameter of the same name and use it as a spin lock;
+// presumably the host passes the address of this object - TODO confirm.
+infra::global_memory<volatile int, 0> lock(0);
+
+// Scores a tile of FMC_BW sift1 points (selected by get_group(2)) against the
+// slab of up to FMC_GH sift2 points selected by get_group(1), then merges the
+// best / second-best result into sift1 under a spin lock. Because the merge
+// compares against the score already stored in sift1, that score must hold a
+// meaningful value before the kernel runs.
+//
+// Descriptors are staged in slices of FMC_BD / 2 float4 values, alternating
+// between the two halves of siftParts1 / siftParts2 so that one half is filled
+// while the other is consumed.
+//
+//   sift1, sift2 : feature arrays
+//   numPts1/2    : number of valid points (indices are clamped to them)
+//   item_ct1     : work-item handle
+//   lock         : integer used as a spin lock (0 = free, 1 = taken)
+//   siftParts1   : work-group scratch; later reused as blksums
+//   siftParts2   : work-group scratch
+void FindMaxCorr9(SiftPoint *sift1, SiftPoint *sift2, int numPts1, int numPts2,
+                  sycl::nd_item<3> item_ct1, volatile int *lock,
+                  sycl::float4 *siftParts1, sycl::float4 *siftParts2)
+{
+  // 4*32*8 = 1024
+  // 4*32*8 = 1024
+  //__shared__ float blksums[FMC_BW*FMC_BH];     // 32*32  = 1024
+  const int tx = item_ct1.get_local_id(2);
+  const int ty = item_ct1.get_local_id(1);
+  const int idx = ty * FMC_NW + tx;
+  sycl::float4 *pts1 = 0, *pts2 = 0;
+  // The first FMC_BW work-items each own one sift1 descriptor of the tile.
+  if (idx < FMC_BW)
+  {
+    const int p1l =
+        sycl::min((unsigned int)(item_ct1.get_group(2) * FMC_BW + idx),
+                  (unsigned int)(numPts1 - 1));
+    pts1 = (sycl::float4 *)sift1[p1l].data;
+  }
+  // Running result for sift1 point idx (meaningful for idx < FMC_BW only).
+  float maxScore = -1.0f;
+  float maxScor2 = -1.0f;
+  int maxIndex = 0;
+  // k walks the slab in tiles of FMC_BH sift2 points.
+  for (int k = 0; k < sycl::min(FMC_GH, (int)(numPts2 - FMC_BH + 1));
+       k += FMC_BH)
+  {
+    // The first FMC_BH work-items each own one sift2 descriptor of the tile.
+    if (idx < FMC_BH)
+    {
+      const int p2l =
+          sycl::min((unsigned int)(item_ct1.get_group(1) * FMC_GH + k + idx),
+                    (unsigned int)(numPts2 - 1));
+      pts2 = (sycl::float4 *)sift2[p2l].data;
+    }
+    float sums[FMC_TW * FMC_TH];
+    for (int i = 0; i < FMC_TW * FMC_TH; i++)
+      sums[i] = 0.0f;
+
+    // Prime the first half of the staging buffers with slice 0.
+    if (idx < FMC_BW)
+      for (int i = 0; i < FMC_BD / 2; i++)
+        siftParts1[(i + 0) * FMC_BW + idx] = pts1[0 + i];
+    if (idx < FMC_BH)
+      for (int i = 0; i < FMC_BD / 2; i++)
+        siftParts2[(i + 0) * FMC_BH + idx] = pts2[0 + i];
+
+    item_ct1.barrier();
+
+    // b is the offset of the half being filled; after `b ^= FMC_BD / 2` it is
+    // the offset of the half filled in the previous step, which is consumed.
+    int b = FMC_BD / 2;
+    for (int d = FMC_BD / 2; d < 32; d += FMC_BD / 2)
+    {
+      if (idx < FMC_BW)
+        for (int i = 0; i < FMC_BD / 2; i++)
+          siftParts1[(i + b) * FMC_BW + idx] = pts1[d + i];
+      if (idx < FMC_BH)
+        for (int i = 0; i < FMC_BD / 2; i++)
+          siftParts2[(i + b) * FMC_BH + idx] = pts2[d + i];
+
+      b ^= FMC_BD / 2;
+      for (int i = 0; i < FMC_BD / 2; i++)
+      {
+        sycl::float4 v1[FMC_TW];
+        for (int ix = 0; ix < FMC_TW; ix++)
+          v1[ix] = siftParts1[(i + b) * FMC_BW + (tx * FMC_TW + ix)];
+        for (int iy = 0; iy < FMC_TH; iy++)
+        {
+          sycl::float4 v2 = siftParts2[(i + b) * FMC_BH + (ty * FMC_TH + iy)];
+          for (int ix = 0; ix < FMC_TW; ix++)
+          {
+            sums[iy * FMC_TW + ix] += v1[ix].x() * v2.x();
+            sums[iy * FMC_TW + ix] += v1[ix].y() * v2.y();
+            sums[iy * FMC_TW + ix] += v1[ix].z() * v2.z();
+            sums[iy * FMC_TW + ix] += v1[ix].w() * v2.w();
+          }
+        }
+      }
+
+      item_ct1.barrier();
+    }
+
+    // Consume the slice staged by the last loop iteration.
+    b ^= FMC_BD / 2;
+    for (int i = 0; i < FMC_BD / 2; i++)
+    {
+      sycl::float4 v1[FMC_TW];
+      for (int ix = 0; ix < FMC_TW; ix++)
+        v1[ix] = siftParts1[(i + b) * FMC_BW + (tx * FMC_TW + ix)];
+      for (int iy = 0; iy < FMC_TH; iy++)
+      {
+        sycl::float4 v2 = siftParts2[(i + b) * FMC_BH + (ty * FMC_TH + iy)];
+        for (int ix = 0; ix < FMC_TW; ix++)
+        {
+          sums[iy * FMC_TW + ix] += v1[ix].x() * v2.x();
+          sums[iy * FMC_TW + ix] += v1[ix].y() * v2.y();
+          sums[iy * FMC_TW + ix] += v1[ix].z() * v2.z();
+          sums[iy * FMC_TW + ix] += v1[ix].w() * v2.w();
+        }
+      }
+    }
+
+    // All reads of siftParts1 must be finished before it is reused below.
+    item_ct1.barrier();
+
+    // Publish the FMC_BW x FMC_BH tile of correlations: row = sift2 point,
+    // column = sift1 point.
+    float *blksums = (float *)siftParts1;
+    for (int iy = 0; iy < FMC_TH; iy++)
+      for (int ix = 0; ix < FMC_TW; ix++)
+        blksums[(ty * FMC_TH + iy) * FMC_BW + (tx * FMC_TW + ix)] = sums[iy * FMC_TW + ix];
+
+    item_ct1.barrier();
+    // Work-item idx scans column idx and updates its running best two.
+    if (idx < FMC_BW)
+    {
+      for (int j = 0; j < FMC_BH; j++)
+      {
+        float sum = blksums[j * FMC_BW + idx];
+        if (sum > maxScore)
+        {
+          maxScor2 = maxScore;
+          maxScore = sum;
+          maxIndex =
+              sycl::min((unsigned int)(item_ct1.get_group(1) * FMC_GH + k + j),
+                        (unsigned int)(numPts2 - 1));
+        }
+        else if (sum > maxScor2)
+          maxScor2 = sum;
+      }
+    }
+
+    item_ct1.barrier();
+  }
+  const int p1 = sycl::min((unsigned int)(item_ct1.get_group(2) * FMC_BW + idx),
+                           (unsigned int)(numPts1 - 1));
+  // One work-item per group spins until it has changed the lock from 0 to 1;
+  // the rest of the group waits at the barrier that follows.
+  if (idx == 0)
+    while (infra::atomic_compare_exchange_strong((int *)lock, 0, 1) != 0)
+      ;
+
+  item_ct1.barrier();
+  // Merge this group's result with what is already stored in sift1.
+  if (idx < FMC_BW)
+  {
+    // Second-best score recovered from the stored ambiguity = second / (best + 1e-6).
+    float maxScor2Old = sift1[p1].ambiguity * (sift1[p1].score + 1e-6f);
+    if (maxScore > sift1[p1].score)
+    {
+      maxScor2 = sycl::max(sift1[p1].score, maxScor2);
+      sift1[p1].ambiguity = maxScor2 / (maxScore + 1e-6f);
+      sift1[p1].score = maxScore;
+      sift1[p1].match = maxIndex;
+      sift1[p1].match_xpos = sift2[maxIndex].xpos;
+      sift1[p1].match_ypos = sift2[maxIndex].ypos;
+    }
+    else if (maxScore > maxScor2Old)
+      sift1[p1].ambiguity = maxScore / (sift1[p1].score + 1e-6f);
+  }
+
+  item_ct1.barrier();
+  // Release the lock once every work-item of the group has finished its update.
+  if (idx == 0)
+    infra::atomic_exchange((int *)lock, 0);
+}
+
+// Scores a tile of FMC_BW sift1 points (selected by get_group(2)) against the
+// slab of up to FMC_GH sift2 points selected by get_group(1), then merges the
+// best / second-best result into sift1 under a spin lock. Because the merge
+// compares against the score already stored in sift1, that score must hold a
+// meaningful value before the kernel runs.
+//
+// Unlike FindMaxCorr9, each staging step loads FMC_BD float4 slices into a
+// single buffer, and the correlation tile has its own scratch array.
+//
+//   sift1, sift2 : feature arrays
+//   numPts1/2    : number of valid points (indices are clamped to them)
+//   item_ct1     : work-item handle
+//   lock         : integer used as a spin lock (0 = free, 1 = taken)
+//   siftParts1/2 : work-group scratch for the staged descriptor slices
+//   blksums      : work-group scratch for the FMC_BW x FMC_BH correlation tile
+void FindMaxCorr8(SiftPoint *sift1, SiftPoint *sift2, int numPts1, int numPts2,
+                  sycl::nd_item<3> item_ct1, volatile int *lock,
+                  sycl::float4 *siftParts1, sycl::float4 *siftParts2,
+                  float *blksums)
+{
+  // 4*32*8 = 1024
+  // 4*32*8 = 1024
+  // 32*32  = 1024
+  const int tx = item_ct1.get_local_id(2);
+  const int ty = item_ct1.get_local_id(1);
+  const int idx = ty * FMC_NW + tx;
+  sycl::float4 *pts1 = 0, *pts2 = 0;
+  // The first FMC_BW work-items each own one sift1 descriptor of the tile.
+  if (idx < FMC_BW)
+  {
+    const int p1l =
+        sycl::min((unsigned int)(item_ct1.get_group(2) * FMC_BW + idx),
+                  (unsigned int)(numPts1 - 1));
+    pts1 = (sycl::float4 *)sift1[p1l].data;
+  }
+  // Running result for sift1 point idx (meaningful for idx < FMC_BW only).
+  float maxScore = -1.0f;
+  float maxScor2 = -1.0f;
+  int maxIndex = 0;
+  // k walks the slab in tiles of FMC_BH sift2 points.
+  for (int k = 0; k < sycl::min(FMC_GH, (int)(numPts2 - FMC_BH + 1));
+       k += FMC_BH)
+  {
+    // The first FMC_BH work-items each own one sift2 descriptor of the tile.
+    if (idx < FMC_BH)
+    {
+      const int p2l =
+          sycl::min((unsigned int)(item_ct1.get_group(1) * FMC_GH + k + idx),
+                    (unsigned int)(numPts2 - 1));
+      pts2 = (sycl::float4 *)sift2[p2l].data;
+    }
+    float sums[FMC_TW * FMC_TH];
+    for (int i = 0; i < FMC_TW * FMC_TH; i++)
+      sums[i] = 0.0f;
+    // The 32 float4 values of a descriptor are processed FMC_BD at a time.
+    for (int d = 0; d < 32; d += FMC_BD)
+    {
+      if (idx < FMC_BW)
+        for (int i = 0; i < FMC_BD; i++)
+          siftParts1[i * FMC_BW + idx] = pts1[d + i];
+      if (idx < FMC_BH)
+        for (int i = 0; i < FMC_BD; i++)
+          siftParts2[i * FMC_BH + idx] = pts2[d + i];
+
+      item_ct1.barrier();
+
+      // Work-item (tx, ty) accumulates sift1 point tx against sift2 points
+      // ty * FMC_TH .. ty * FMC_TH + FMC_TH - 1 of the tile.
+      for (int i = 0; i < FMC_BD; i++)
+      {
+        sycl::float4 v1[FMC_TW];
+        for (int ix = 0; ix < FMC_TW; ix++)
+          v1[ix] = siftParts1[i * FMC_BW + (tx * FMC_TW + ix)];
+        for (int iy = 0; iy < FMC_TH; iy++)
+        {
+          sycl::float4 v2 = siftParts2[i * FMC_BH + (ty * FMC_TH + iy)];
+          for (int ix = 0; ix < FMC_TW; ix++)
+          {
+            sums[iy * FMC_TW + ix] += v1[ix].x() * v2.x();
+            sums[iy * FMC_TW + ix] += v1[ix].y() * v2.y();
+            sums[iy * FMC_TW + ix] += v1[ix].z() * v2.z();
+            sums[iy * FMC_TW + ix] += v1[ix].w() * v2.w();
+          }
+        }
+      }
+
+      // The staging buffers are overwritten by the next slice.
+      item_ct1.barrier();
+    }
+    // float *blksums = (float*)siftParts1;
+    // Publish the correlation tile: row = sift2 point, column = sift1 point.
+    for (int iy = 0; iy < FMC_TH; iy++)
+      for (int ix = 0; ix < FMC_TW; ix++)
+        blksums[(ty * FMC_TH + iy) * FMC_BW + (tx * FMC_TW + ix)] = sums[iy * FMC_TW + ix];
+
+    item_ct1.barrier();
+    // Work-item idx scans column idx and updates its running best two.
+    if (idx < FMC_BW)
+    {
+      for (int j = 0; j < FMC_BH; j++)
+      {
+        float sum = blksums[j * FMC_BW + idx];
+        if (sum > maxScore)
+        {
+          maxScor2 = maxScore;
+          maxScore = sum;
+          maxIndex =
+              sycl::min((unsigned int)(item_ct1.get_group(1) * FMC_GH + k + j),
+                        (unsigned int)(numPts2 - 1));
+        }
+        else if (sum > maxScor2)
+          maxScor2 = sum;
+      }
+    }
+
+    item_ct1.barrier();
+  }
+  const int p1 = sycl::min((unsigned int)(item_ct1.get_group(2) * FMC_BW + idx),
+                           (unsigned int)(numPts1 - 1));
+  // One work-item per group spins until it has changed the lock from 0 to 1;
+  // the rest of the group waits at the barrier that follows.
+  if (idx == 0)
+    while (infra::atomic_compare_exchange_strong((int *)lock, 0, 1) != 0)
+      ;
+
+  item_ct1.barrier();
+  // Merge this group's result with what is already stored in sift1.
+  if (idx < FMC_BW)
+  {
+    // Second-best score recovered from the stored ambiguity = second / (best + 1e-6).
+    float maxScor2Old = sift1[p1].ambiguity * (sift1[p1].score + 1e-6f);
+    if (maxScore > sift1[p1].score)
+    {
+      maxScor2 = sycl::max(sift1[p1].score, maxScor2);
+      sift1[p1].ambiguity = maxScor2 / (maxScore + 1e-6f);
+      sift1[p1].score = maxScore;
+      sift1[p1].match = maxIndex;
+      sift1[p1].match_xpos = sift2[maxIndex].xpos;
+      sift1[p1].match_ypos = sift2[maxIndex].ypos;
+    }
+    else if (maxScore > maxScor2Old)
+      sift1[p1].ambiguity = maxScore / (sift1[p1].score + 1e-6f);
+  }
+
+  item_ct1.barrier();
+  // Release the lock once every work-item of the group has finished its update.
+  if (idx == 0)
+    infra::atomic_exchange((int *)lock, 0);
+}
+
+// Scores a tile of 16 sift1 points (selected by get_group(2)) against a slab
+// of 512 sift2 points (selected by get_group(1)), 16 sift2 points at a time,
+// then merges the best / second-best result into sift1 under a spin lock.
+// Because the merge compares against the score already stored in sift1, that
+// score must hold a meaningful value before the kernel runs.
+//
+// The index arithmetic (16 * ty + tx, 17 * tx + ty) assumes tx and ty both run
+// over 0..15 - the launch is not visible here, TODO confirm.
+//
+//   sift1, sift2 : feature arrays
+//   numPts1/2    : number of valid points (indices are clamped to them)
+//   item_ct1     : work-item handle
+//   lock         : integer used as a spin lock (0 = free, 1 = taken)
+//   siftParts1   : work-group scratch, accessed as float4 with a row stride of
+//                  17; later reused as the 16 x 16 `sums` array
+//   siftParts2   : work-group scratch, accessed as float4 with a row stride of 16
+void FindMaxCorr7(SiftPoint *sift1, SiftPoint *sift2, int numPts1, int numPts2,
+                  sycl::nd_item<3> item_ct1, volatile int *lock,
+                  float *siftParts1, float *siftParts2)
+{
+  // features in columns
+  // one extra to avoid shared conflicts
+  sycl::float4 *pts1 = (sycl::float4 *)siftParts1;
+  sycl::float4 *pts2 = (sycl::float4 *)siftParts2;
+  const int tx = item_ct1.get_local_id(2);
+  const int ty = item_ct1.get_local_id(1);
+  // During loading, row ty of the group handles sift1 point ty of the tile.
+  const int p1l = sycl::min((unsigned int)(item_ct1.get_group(2) * 16 + ty),
+                            (unsigned int)(numPts1 - 1));
+  const sycl::float4 *p1l4 = (sycl::float4 *)sift1[p1l].data;
+  // Running result; only the copies held by row ty == 0 are updated and used.
+  float maxScore = -1.0f;
+  float maxScor2 = -1.0f;
+  int maxIndex = 0;
+  // 512 / 16 sub-tiles of 16 sift2 points each.
+  for (int k = 0; k < 512 / 16; k++)
+  {
+    const int p2l =
+        sycl::min((unsigned int)(item_ct1.get_group(1) * 512 + k * 16 + ty),
+                  (unsigned int)(numPts2 - 1));
+    const sycl::float4 *p2l4 = (sycl::float4 *)sift2[p2l].data;
+// Number of sift2 points accumulated by each scoring work-item.
+#define NUM 4
+    float sum[NUM];
+    if (ty < (16 / NUM))
+      for (int l = 0; l < NUM; l++)
+        sum[l] = 0.0f;
+
+    item_ct1.barrier();
+    // The 32 float4 values of a descriptor are processed in two halves of 16.
+    for (int i = 0; i < 2; i++)
+    {
+      // pts1 is stored transposed (slice index major, point index minor).
+      pts1[17 * tx + ty] = p1l4[i * 16 + tx];
+      pts2[16 * ty + tx] = p2l4[i * 16 + tx];
+
+      item_ct1.barrier();
+      // Only rows ty < 16 / NUM score: work-item (tx, ty) accumulates sift1
+      // point tx against sift2 points ty, ty + 4, ty + 8 and ty + 12.
+      if (ty < (16 / NUM))
+      {
+#pragma unroll
+        for (int j = 0; j < 16; j++)
+        {
+          sycl::float4 p1v = pts1[17 * j + tx];
+#pragma unroll
+          for (int l = 0; l < NUM; l++)
+          {
+            sycl::float4 p2v = pts2[16 * (ty + l * (16 / NUM)) + j];
+            sum[l] += p1v.x() * p2v.x();
+            sum[l] += p1v.y() * p2v.y();
+            sum[l] += p1v.z() * p2v.z();
+            sum[l] += p1v.w() * p2v.w();
+          }
+        }
+      }
+
+      item_ct1.barrier();
+    }
+    // siftParts1 is free again; store the 16 x 16 correlations there with
+    // row = sift2 point and column = sift1 point.
+    float *sums = siftParts1;
+    if (ty < (16 / NUM))
+      for (int l = 0; l < NUM; l++)
+        sums[16 * (ty + l * (16 / NUM)) + tx] = sum[l];
+
+    item_ct1.barrier();
+    // Row ty == 0: work-item tx scans column tx and updates its best two.
+    if (ty == 0)
+    {
+      for (int j = 0; j < 16; j++)
+      {
+        float sum = sums[16 * j + tx];
+        if (sum > maxScore)
+        {
+          maxScor2 = maxScore;
+          maxScore = sum;
+          maxIndex = sycl::min(
+              (unsigned int)(item_ct1.get_group(1) * 512 + k * 16 + j),
+              (unsigned int)(numPts2 - 1));
+        }
+        else if (sum > maxScor2)
+          maxScor2 = sum;
+      }
+    }
+
+    item_ct1.barrier();
+  }
+  // For the final update the sift1 point is selected by tx (not ty).
+  const int p1 = sycl::min((unsigned int)(item_ct1.get_group(2) * 16 + tx),
+                           (unsigned int)(numPts1 - 1));
+  // One work-item per group spins until it has changed the lock from 0 to 1;
+  // the rest of the group waits at the barrier that follows.
+  if (tx == 0 && ty == 0)
+    while (infra::atomic_compare_exchange_strong((int *)lock, 0, 1) != 0)
+      ;
+
+  item_ct1.barrier();
+  // Merge this group's result with what is already stored in sift1.
+  if (ty == 0)
+  {
+    // Second-best score recovered from the stored ambiguity = second / (best + 1e-6).
+    float maxScor2Old = sift1[p1].ambiguity * (sift1[p1].score + 1e-6f);
+    if (maxScore > sift1[p1].score)
+    {
+      maxScor2 = sycl::max(sift1[p1].score, maxScor2);
+      sift1[p1].ambiguity = maxScor2 / (maxScore + 1e-6f);
+      sift1[p1].score = maxScore;
+      sift1[p1].match = maxIndex;
+      sift1[p1].match_xpos = sift2[maxIndex].xpos;
+      sift1[p1].match_ypos = sift2[maxIndex].ypos;
+    }
+    else if (maxScore > maxScor2Old)
+      sift1[p1].ambiguity = maxScore / (sift1[p1].score + 1e-6f);
+  }
+
+  item_ct1.barrier();
+  // Release the lock once every work-item of the group has finished its update.
+  if (tx == 0 && ty == 0)
+    infra::atomic_exchange((int *)lock, 0);
+}
+
+// FindMaxCorr6: scores a tile of 16 sift1 points against up to 512 sift2
+// points by descriptor dot product and merges the best / second-best score of
+// each sift1 point into sift1.
+//
+//   sift1, sift2      point arrays; sift1 is updated in place (score,
+//                     ambiguity, match, match_xpos, match_ypos)
+//   numPts1, numPts2  element counts; every point index is clamped to count - 1
+//   item_ct1          work-item handle; group index 2 selects the sift1 tile,
+//                     group index 1 selects the 512-point sift2 range
+//   lock              int used as a spin lock (compare-exchange 0 -> 1 to
+//                     take it, exchange to 0 to release it)
+//   siftParts2        scratch for 16 sift2 descriptors, addressed as float4
+//                     with a row stride of 32
+//   sums              scratch for the 16x16 score tile
+//
+// Each work-item keeps one float4 of a sift1 descriptor (index tx) in a
+// register, so the indexing expects tx in 0..31 and ty in 0..15; the visible
+// launch switches to sycl::range<3>(1, 16, 32) for this kernel.
+void FindMaxCorr6(SiftPoint *sift1, SiftPoint *sift2, int numPts1, int numPts2,
+                  sycl::nd_item<3> item_ct1, volatile int *lock,
+                  float *siftParts2, float *sums)
+{
+  // Unlike the sibling kernels there is no siftParts1 scratch here: the sift1
+  // part lives in the per-work-item variable part1 below.
+
+  const int tx = item_ct1.get_local_id(2);
+  const int ty = item_ct1.get_local_id(1);
+  // ty selects the sift1 point whose descriptor slice this work-item holds.
+  const int p1l = sycl::min((unsigned int)(item_ct1.get_group(2) * 16 + ty),
+                            (unsigned int)(numPts1 - 1));
+  float *pt1l = sift1[p1l].data;
+  sycl::float4 part1 = reinterpret_cast<sycl::float4 *>(pt1l)[tx];
+  float maxScore = -1.0f;
+  float maxScor2 = -1.0f;
+  int maxIndex = 0;
+  // 32 sub-tiles of 16 sift2 points cover this group's 512-point range.
+  for (int k = 0; k < 512; k += 16)
+  {
+    const int p2l =
+        sycl::min((unsigned int)(item_ct1.get_group(1) * 512 + k + ty),
+                  (unsigned int)(numPts2 - 1));
+    float *pt2l = sift2[p2l].data;
+    // Stage float4 number tx of sift2 point ty.
+    reinterpret_cast<sycl::float4 *>(siftParts2)[32 * ty + tx] =
+        reinterpret_cast<sycl::float4 *>(pt2l)[tx];
+
+    item_ct1.barrier();
+    for (int i = 0; i < 16; i++)
+    {
+      // Partial dot product of sift1 point ty and sift2 point i over float4 tx.
+      sycl::float4 part2 =
+          reinterpret_cast<sycl::float4 *>(siftParts2)[32 * i + tx];
+      float sum = part1.x() * part2.x() + part1.y() * part2.y() +
+                  part1.z() * part2.z() + part1.w() * part2.w();
+      // NOTE(review): ShiftDown is presumably a sub-group shuffle-down, so
+      // these five steps would reduce the 32 partial sums into the work-item
+      // with tx == 0 — confirm against its definition.
+      sum += ShiftDown(sum, 16, item_ct1);
+      sum += ShiftDown(sum, 8, item_ct1);
+      sum += ShiftDown(sum, 4, item_ct1);
+      sum += ShiftDown(sum, 2, item_ct1);
+      sum += ShiftDown(sum, 1, item_ct1);
+      // Score tile layout: sums[16 * sift2Point + sift1Point].
+      if (tx == 0)
+        sums[16 * i + ty] = sum;
+    }
+
+    item_ct1.barrier();
+    // The first 16 work-items of row ty == 0 each scan the 16 scores of sift1
+    // point tx and keep the running best / second-best and the best index.
+    if (ty == 0 && tx < 16)
+    {
+      for (int j = 0; j < 16; j++)
+      {
+        float sum = sums[16 * j + tx];
+        if (sum > maxScore)
+        {
+          maxScor2 = maxScore;
+          maxScore = sum;
+          maxIndex =
+              sycl::min((unsigned int)(item_ct1.get_group(1) * 512 + k + j),
+                        (unsigned int)(numPts2 - 1));
+        }
+        else if (sum > maxScor2)
+          maxScor2 = sum;
+      }
+    }
+
+    // Scratch must not be overwritten by the next sub-tile until scanned.
+    item_ct1.barrier();
+  }
+  // One work-item per group spins until it takes the lock; the rest of the
+  // group waits at the barrier below.
+  // NOTE(review): presumably atomic_compare_exchange_strong returns the
+  // previous value, so a non-zero result means the lock was held — confirm.
+  if (tx == 0 && ty == 0)
+    while (infra::atomic_compare_exchange_strong((int *)lock, 0, 1) != 0)
+      ;
+
+  item_ct1.barrier();
+  if (ty == 0 && tx < 16)
+  {
+    const int p1 = sycl::min((unsigned int)(item_ct1.get_group(2) * 16 + tx),
+                             (unsigned int)(numPts1 - 1));
+    // Recover the stored second-best score; ambiguity is written below as
+    // secondBest / (best + 1e-6f).
+    float maxScor2Old = sift1[p1].ambiguity * (sift1[p1].score + 1e-6f);
+    if (maxScore > sift1[p1].score)
+    {
+      // New best: the previous best becomes a candidate for second-best.
+      maxScor2 = sycl::max(sift1[p1].score, maxScor2);
+      sift1[p1].ambiguity = maxScor2 / (maxScore + 1e-6f);
+      sift1[p1].score = maxScore;
+      sift1[p1].match = maxIndex;
+      sift1[p1].match_xpos = sift2[maxIndex].xpos;
+      sift1[p1].match_ypos = sift2[maxIndex].ypos;
+    }
+    else if (maxScore > maxScor2Old)
+      // Stored best stays; only the second-best (hence ambiguity) improves.
+      sift1[p1].ambiguity = maxScore / (sift1[p1].score + 1e-6f);
+  }
+  // All updates of the group are issued before the lock is released.
+  item_ct1.barrier();
+  if (tx == 0 && ty == 0)
+    infra::atomic_exchange((int *)lock, 0);
+}
+
+// FindMaxCorr5: scores a tile of 16 sift1 points against up to 512 sift2
+// points by descriptor dot product and merges the best / second-best score of
+// each sift1 point into sift1.
+//
+//   sift1, sift2      point arrays; sift1 is updated in place (score,
+//                     ambiguity, match, match_xpos, match_ypos)
+//   numPts1, numPts2  element counts; every point index is clamped to count - 1
+//   item_ct1          work-item handle; group index 2 selects the sift1 tile,
+//                     group index 1 selects the 512-point sift2 range
+//   lock              int used as a spin lock (compare-exchange 0 -> 1 to
+//                     take it, exchange to 0 to release it)
+//   siftParts1/2      float scratch addressed with a row stride of 17;
+//                     siftParts1 is also reused for the 16x16 score tile
+//
+// The indexing assumes tx and ty both run over 0..15; the visible launch uses
+// sycl::range<3>(1, 16, 16) for this kernel.
+void FindMaxCorr5(SiftPoint *sift1, SiftPoint *sift2, int numPts1, int numPts2,
+                  sycl::nd_item<3> item_ct1, volatile int *lock,
+                  float *siftParts1, float *siftParts2)
+{
+  // Scratch layout: [17 * component + point]. The stride is 17 rather than 16
+  // (one extra column, per the original note "to avoid shared conflicts").
+  const int tx = item_ct1.get_local_id(2);
+  const int ty = item_ct1.get_local_id(1);
+  // While loading, ty picks which sift1 point of the tile this work-item reads.
+  const int p1l = sycl::min((unsigned int)(item_ct1.get_group(2) * 16 + ty),
+                            (unsigned int)(numPts1 - 1));
+  const float *pt1l = sift1[p1l].data;
+  float maxScore = -1.0f;
+  float maxScor2 = -1.0f;
+  int maxIndex = 0;
+  // 32 sub-tiles of 16 sift2 points cover this group's 512-point range.
+  for (int k = 0; k < 512 / 16; k++)
+  {
+    const int p2l =
+        sycl::min((unsigned int)(item_ct1.get_group(1) * 512 + k * 16 + ty),
+                  (unsigned int)(numPts2 - 1));
+    const float *pt2l = sift2[p2l].data;
+    // Work-item (tx, ty) accumulates the dot product of sift1 point tx and
+    // sift2 point ty.
+    float sum = 0.0f;
+    // Eight passes of 16 floats each read 128 floats per descriptor.
+    for (int i = 0; i < 8; i++)
+    {
+      siftParts1[17 * tx + ty] = pt1l[i * 16 + tx]; // load and transpose
+      siftParts2[17 * tx + ty] = pt2l[i * 16 + tx];
+      item_ct1.barrier();
+      for (int j = 0; j < 16; j++)
+        sum += siftParts1[17 * j + tx] * siftParts2[17 * j + ty];
+      // Staged data must not be overwritten until every reader is done.
+      item_ct1.barrier();
+    }
+    // The score tile reuses siftParts1: sums[16 * sift2Point + sift1Point].
+    float *sums = siftParts1;
+    sums[16 * ty + tx] = sum;
+    item_ct1.barrier();
+    // Row ty == 0 scans the 16 scores of sift1 point tx and keeps the running
+    // best and second-best score plus the index of the best sift2 point.
+    if (ty == 0)
+    {
+      for (int j = 0; j < 16; j++)
+      {
+        float sum = sums[16 * j + tx];
+        if (sum > maxScore)
+        {
+          maxScor2 = maxScore;
+          maxScore = sum;
+          maxIndex = sycl::min(
+              (unsigned int)(item_ct1.get_group(1) * 512 + k * 16 + j),
+              (unsigned int)(numPts2 - 1));
+        }
+        else if (sum > maxScor2)
+          maxScor2 = sum;
+      }
+    }
+    item_ct1.barrier();
+  }
+  // For the merge step, tx picks the sift1 point this work-item owns.
+  const int p1 = sycl::min((unsigned int)(item_ct1.get_group(2) * 16 + tx),
+                           (unsigned int)(numPts1 - 1));
+  // One work-item per group spins until it takes the lock; the rest of the
+  // group waits at the barrier below.
+  // NOTE(review): presumably atomic_compare_exchange_strong returns the
+  // previous value, so a non-zero result means the lock was held — confirm.
+  if (tx == 0 && ty == 0)
+    while (infra::atomic_compare_exchange_strong((int *)lock, 0, 1) != 0)
+      ;
+  item_ct1.barrier();
+  if (ty == 0)
+  {
+    // Recover the stored second-best score; ambiguity is written below as
+    // secondBest / (best + 1e-6f).
+    float maxScor2Old = sift1[p1].ambiguity * (sift1[p1].score + 1e-6f);
+    if (maxScore > sift1[p1].score)
+    {
+      // New best: the previous best becomes a candidate for second-best.
+      maxScor2 = sycl::max(sift1[p1].score, maxScor2);
+      sift1[p1].ambiguity = maxScor2 / (maxScore + 1e-6f);
+      sift1[p1].score = maxScore;
+      sift1[p1].match = maxIndex;
+      sift1[p1].match_xpos = sift2[maxIndex].xpos;
+      sift1[p1].match_ypos = sift2[maxIndex].ypos;
+    }
+    else if (maxScore > maxScor2Old)
+      // Stored best stays; only the second-best (hence ambiguity) improves.
+      sift1[p1].ambiguity = maxScore / (sift1[p1].score + 1e-6f);
+  }
+  // All updates of the group are issued before the lock is released.
+  item_ct1.barrier();
+  if (tx == 0 && ty == 0)
+    infra::atomic_exchange((int *)lock, 0);
+}
+
+/**
+ * Inverts a size x size matrix by in-place LU decomposition with row
+ * pivoting, followed by forward and back substitution on each unit vector.
+ *
+ * Pivot candidates are weighted by the reciprocal of the largest magnitude in
+ * their row (vv), and a pivot that is exactly zero is replaced by 1e-16f so
+ * the substitution never divides by zero.
+ *
+ * All constants are single-precision literals: this routine is called from
+ * device code, and double literals would silently promote the divisions and
+ * comparisons to double precision, which not every device supports.
+ *
+ * @tparam size  Matrix dimension.
+ * @param elem   Matrix to invert; overwritten with its row-permuted LU factors.
+ * @param res    Receives the inverse of the original contents of elem.
+ */
+template <int size>
+void InvertMatrix(float elem[size][size], float res[size][size])
+{
+  int indx[size];
+  float b[size];
+  float vv[size];
+  for (int i = 0; i < size; i++)
+    indx[i] = 0;
+  int imax = 0;
+  for (int i = 0; i < size; i++)
+  { // scaling factor per row: reciprocal of its largest magnitude
+    float big = 0.0f;
+    for (int j = 0; j < size; j++)
+    {
+      float temp = sycl::fabs(elem[i][j]);
+      if (temp > big)
+        big = temp;
+    }
+    if (big > 0.0f)
+      vv[i] = 1.0f / big;
+    else
+      vv[i] = 1e16f; // all-zero row: keep going with a huge scale factor
+  }
+  for (int j = 0; j < size; j++)
+  {
+    // Entries of column j above the diagonal.
+    for (int i = 0; i < j; i++)
+    {
+      float sum = elem[i][j];
+      for (int k = 0; k < i; k++)
+        sum -= elem[i][k] * elem[k][j];
+      elem[i][j] = sum;
+    }
+    // Entries on and below the diagonal; remember the best scaled pivot row.
+    float big = 0.0f;
+    for (int i = j; i < size; i++)
+    {
+      float sum = elem[i][j];
+      for (int k = 0; k < j; k++)
+        sum -= elem[i][k] * elem[k][j];
+      elem[i][j] = sum;
+      float dum = vv[i] * sycl::fabs(sum);
+      if (dum >= big)
+      {
+        big = dum;
+        imax = i;
+      }
+    }
+    if (j != imax)
+    { // bring the pivot row up to row j
+      for (int k = 0; k < size; k++)
+      {
+        float dum = elem[imax][k];
+        elem[imax][k] = elem[j][k];
+        elem[j][k] = dum;
+      }
+      vv[imax] = vv[j]; // the scale factor travels with the swapped row
+    }
+    indx[j] = imax;
+    if (elem[j][j] == 0.0f)
+      elem[j][j] = 1e-16f; // singular pivot: substitute a tiny value
+    if (j != (size - 1))
+    {
+      // Divide the sub-diagonal part of column j by the pivot.
+      float dum = 1.0f / elem[j][j];
+      for (int i = j + 1; i < size; i++)
+        elem[i][j] *= dum;
+    }
+  }
+  // Solve for one column of the inverse at a time.
+  for (int j = 0; j < size; j++)
+  {
+    for (int k = 0; k < size; k++)
+      b[k] = 0.0f;
+    b[j] = 1.0f;
+    // Forward substitution, applying the recorded row swaps on the fly.
+    // ii is the first row with a non-zero right-hand side; rows before it
+    // contribute nothing and are skipped.
+    int ii = -1;
+    for (int i = 0; i < size; i++)
+    {
+      int ip = indx[i];
+      float sum = b[ip];
+      b[ip] = b[i];
+      if (ii != -1)
+        for (int k = ii; k < i; k++)
+          sum -= elem[i][k] * b[k];
+      else if (sum != 0.0f)
+        ii = i;
+      b[i] = sum;
+    }
+    // Back substitution.
+    for (int i = size - 1; i >= 0; i--)
+    {
+      float sum = b[i];
+      for (int k = i + 1; k < size; k++)
+        sum -= elem[i][k] * b[k];
+      b[i] = sum / elem[i][i];
+    }
+    for (int i = 0; i < size; i++)
+      res[i][j] = b[i];
+  }
+}
+
+// ComputeHomographies: every work-item builds one 8x8 linear system from four
+// point correspondences and solves it for the eight homography parameters.
+//
+//   coord    four planes of numPts floats each: x1, y1, x2, y2
+//   randPts  four planes of sample indices, one entry per work-item and plane
+//   homo     output, eight planes with one entry per work-item
+//   numPts   plane stride of coord
+//   item_ct1 work-item handle (dimension 2 carries the work-item index)
+void ComputeHomographies(float *coord, int *randPts, float *homo,
+                         int numPts, sycl::nd_item<3> item_ct1)
+{
+  float sys[8][8], sysInv[8][8];
+  float rhs[8];
+  const int group = item_ct1.get_group(2);
+  const int lane = item_ct1.get_local_id(2);
+  // Global work-item index and total number of work-items along dimension 2.
+  const int slot = item_ct1.get_local_range().get(2) * group + lane;
+  const int total =
+      item_ct1.get_local_range().get(2) * item_ct1.get_group_range(2);
+  // Each correspondence contributes two rows: one for x2, one for y2.
+  for (int s = 0; s < 4; s++)
+  {
+    const int pt = randPts[s * total + slot];
+    const float srcX = coord[pt + 0 * numPts];
+    const float srcY = coord[pt + 1 * numPts];
+    const float dstX = coord[pt + 2 * numPts];
+    const float dstY = coord[pt + 3 * numPts];
+
+    float *rowX = sys[2 * s + 0];
+    rowX[0] = srcX;
+    rowX[1] = srcY;
+    rowX[2] = 1.0f;
+    rowX[3] = 0.0f;
+    rowX[4] = 0.0f;
+    rowX[5] = 0.0f;
+    rowX[6] = -dstX * srcX;
+    rowX[7] = -dstX * srcY;
+    rhs[2 * s + 0] = dstX;
+
+    float *rowY = sys[2 * s + 1];
+    rowY[0] = 0.0f;
+    rowY[1] = 0.0f;
+    rowY[2] = 0.0f;
+    rowY[3] = srcX;
+    rowY[4] = srcY;
+    rowY[5] = 1.0f;
+    rowY[6] = -dstY * srcX;
+    rowY[7] = -dstY * srcY;
+    rhs[2 * s + 1] = dstY;
+  }
+  InvertMatrix<8>(sys, sysInv);
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+  // Parameters = inverse * right-hand side, stored one plane per parameter.
+  for (int r = 0; r < 8; r++)
+  {
+    float acc = 0.0f;
+    for (int c = 0; c < 8; c++)
+      acc += sysInv[r][c] * rhs[c];
+    homo[r * total + slot] = acc;
+  }
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+}
+
+// Work-group shape of TestHomographies: a group scores TESTHOMO_LOOPS
+// candidate homographies, and TESTHOMO_TESTS work-items share the loop over
+// the points for each candidate (each takes every TESTHOMO_TESTS-th point).
+#define TESTHOMO_TESTS 16 // number of tests per block,  alt. 32, 32
+#define TESTHOMO_LOOPS 16 // number of loops per block,  alt.  8, 16
+
+// TestHomographies: counts, for each candidate homography, how many point
+// pairs it maps to within the given squared distance.
+//
+//   d_coord   four planes of numPts floats each: x1, y1, x2, y2
+//   d_homo    eight planes of candidate parameters, one entry per candidate
+//   d_counts  output, one count per candidate
+//   numPts    number of points tested and plane stride of d_coord
+//   thresh2   squared distance threshold
+//   item_ct1  work-item handle; local id 1 selects the candidate within the
+//             group, local id 2 the share of the point loop
+//   homo      scratch, indexed up to 8 * TESTHOMO_LOOPS floats
+//   cnts      scratch, indexed up to TESTHOMO_TESTS * TESTHOMO_LOOPS ints
+void TestHomographies(float *d_coord, float *d_homo,
+                      int *d_counts, int numPts, float thresh2, sycl::nd_item<3> item_ct1,
+                      float *homo, int *cnts)
+{
+  const int col = item_ct1.get_local_id(2);
+  const int row = item_ct1.get_local_id(1);
+  const int candidate =
+      item_ct1.get_group(1) * item_ct1.get_local_range().get(1) + col;
+  const int numCandidates =
+      item_ct1.get_local_range().get(1) * item_ct1.get_group_range(1);
+
+  // Stage the parameters of this group's candidates: homo[8 * candidate + k].
+  if (row < 8 && col < TESTHOMO_LOOPS)
+    homo[col * 8 + row] = d_homo[candidate + row * numCandidates];
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+
+  // Private copy of the eight parameters of candidate `row`.
+  float h[8];
+  for (int k = 0; k < 8; k++)
+    h[k] = homo[row * 8 + k];
+
+  // Each work-item tests every TESTHOMO_TESTS-th point, starting at `col`.
+  int inliers = 0;
+  for (int p = col; p < numPts; p += TESTHOMO_TESTS)
+  {
+    const float x1 = d_coord[p + 0 * numPts];
+    const float y1 = d_coord[p + 1 * numPts];
+    const float x2 = d_coord[p + 2 * numPts];
+    const float y2 = d_coord[p + 3 * numPts];
+    const float nomx = h[0] * x1 + h[1] * y1 + h[2];
+    const float nomy = h[3] * x1 + h[4] * y1 + h[5];
+    const float deno = h[6] * x1 + h[7] * y1 + 1.0f;
+    // Error scaled by the denominator, so no division is needed.
+    const float errx = x2 * deno - nomx;
+    const float erry = y2 * deno - nomy;
+    const float err2 = errx * errx + erry * erry;
+    if (err2 < thresh2 * deno * deno)
+      inliers++;
+  }
+
+  // Per-candidate partial counts: cnts[TESTHOMO_TESTS * candidate + col].
+  int *partial = cnts + TESTHOMO_TESTS * row;
+  partial[col] = inliers;
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+
+  // Halving reduction of each candidate's partial counts into its entry 0.
+  for (int half = TESTHOMO_TESTS / 2; half > 0; half /= 2)
+  {
+    if (col < half)
+      partial[col] += partial[col + half];
+    item_ct1.barrier();
+  }
+
+  // Row 0 writes out the totals, one candidate per work-item.
+  if (col < TESTHOMO_LOOPS && row == 0)
+    d_counts[candidate] = cnts[TESTHOMO_TESTS * col];
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+}
+
+//================= Host matching functions =====================//
+
+/**
+ * Estimates a homography from the matches stored in `data` by random
+ * sampling: candidate homographies are computed on the device from random
+ * 4-point samples (ComputeHomographies), every candidate is scored
+ * (TestHomographies), and the candidate with the highest count is returned.
+ *
+ * @param data          SIFT data; its device array d_data supplies score,
+ *                      ambiguity, xpos/ypos and match_xpos/match_ypos.
+ * @param homography    Output, 9 floats. Set to the identity first; entries
+ *                      0..7 are overwritten with the best candidate when at
+ *                      least 8 points pass the filters (entry 8 stays 1).
+ * @param numMatches    Output: count reported for the best candidate, or 0
+ *                      when no estimate was made.
+ * @param q_ct          Queue used for all allocations, copies and kernels.
+ * @param matchTime     Accumulator in microseconds; only advanced when
+ *                      DEVICE_TIMER is defined.
+ * @param numLoops      Number of candidates; rounded up to a multiple of 16.
+ * @param minScore      Only points with score > minScore are sampled.
+ * @param maxAmbiguity  Only points with ambiguity < maxAmbiguity are sampled.
+ * @param thresh        Distance threshold; squared before it is passed on.
+ * @return The value of matchTime, or 0 on an early exit.
+ */
+double FindHomography(SiftData &data, float *homography, int *numMatches, sycl::queue &q_ct, float &matchTime, int numLoops, float minScore, float maxAmbiguity, float thresh)
+{
+  *numMatches = 0;
+  homography[0] = homography[4] = homography[8] = 1.0f;
+  homography[1] = homography[2] = homography[3] = 0.0f;
+  homography[5] = homography[6] = homography[7] = 0.0f;
+  if (data.d_data == NULL)
+    return 0.0f;
+  SiftPoint *d_sift = data.d_data;
+  numLoops = iDivUp(numLoops, 16) * 16;
+  int numPts = data.numPts;
+  // Fewer than 8 points cannot be sampled, and a non-positive loop count
+  // would launch empty kernels and later read d_homo[-1].
+  if (numPts < 8 || numLoops <= 0)
+    return 0.0f;
+  int numPtsUp = iDivUp(numPts, 16) * 16;
+  float *d_coord, *d_homo;
+  int *d_randPts, *h_randPts;
+  int randSize = 4 * sizeof(int) * numLoops;
+  int szFl = sizeof(float);
+  int szPt = sizeof(SiftPoint);
+  // Bytes from one float field of element 0 to the end of the same field in
+  // element numPts - 1. Copying szPt * numPts bytes from a field address
+  // would run past the end of the last SiftPoint.
+  const int fieldSpan = szPt * (numPts - 1) + szFl;
+
+#ifdef DEVICE_TIMER
+  auto start_malloc_1 = std::chrono::steady_clock::now();
+#endif
+  d_coord = (float *)sycl::malloc_device(4 * sizeof(float) * numPtsUp, q_ct);
+  d_randPts = (int *)sycl::malloc_device(randSize, q_ct);
+  d_homo = (float *)sycl::malloc_device(8 * sizeof(float) * numLoops, q_ct);
+
+#ifdef DEVICE_TIMER
+  auto stop_malloc_1 = std::chrono::steady_clock::now();
+  matchTime += std::chrono::duration<float, std::micro>(stop_malloc_1 - start_malloc_1).count();
+#endif
+  h_randPts = (int *)malloc(randSize);
+  float *h_scores = (float *)malloc(sizeof(float) * numPtsUp);
+  float *h_ambiguities = (float *)malloc(sizeof(float) * numPtsUp);
+  // Host staging buffers holding the strided score / ambiguity fields.
+  float *temp1 = (float *)malloc(szPt * numPtsUp);
+  float *temp2 = (float *)malloc(szPt * numPtsUp);
+
+#ifdef DEVICE_TIMER
+  auto start_memcpy_1 = std::chrono::steady_clock::now();
+#endif
+
+  infra::sift_memcpy(temp1, &d_sift[0].score, fieldSpan, infra::device_to_host, q_ct);
+  infra::sift_memcpy(temp2, &d_sift[0].ambiguity, fieldSpan, infra::device_to_host, q_ct);
+  q_ct.wait();
+
+#ifdef DEVICE_TIMER
+  auto stop_memcpy_1 = std::chrono::steady_clock::now();
+  matchTime += std::chrono::duration<float, std::micro>(stop_memcpy_1 - start_memcpy_1).count();
+#endif
+  // Gather one float every szPt bytes into the dense host arrays.
+  char *src_score = (char *)temp1;
+  char *src_ambiguity = (char *)temp2;
+  char *dst_score = (char *)h_scores;
+  char *dst_ambiguity = (char *)h_ambiguities;
+
+  for (int i = 0; i < numPts; ++i)
+  {
+    memcpy(dst_score, src_score, szFl);
+    memcpy(dst_ambiguity, src_ambiguity, szFl);
+
+    src_score += szPt;
+    src_ambiguity += szPt;
+    dst_score += szFl;
+    dst_ambiguity += szFl;
+  }
+
+  // The staging buffers are not needed past this point.
+  free(temp1);
+  free(temp2);
+
+  // Indices of the points that pass both filters.
+  int *validPts = (int *)malloc(sizeof(int) * numPts);
+  int numValid = 0;
+
+  for (int i = 0; i < numPts; i++)
+  {
+    if (h_scores[i] > minScore && h_ambiguities[i] < maxAmbiguity)
+      validPts[numValid++] = i;
+  }
+
+  free(h_scores);
+  free(h_ambiguities);
+
+  if (numValid >= 8)
+  {
+    // Draw four distinct valid points for every candidate.
+    std::random_device rd;
+    uint32_t seed = rd();
+    std::mt19937 rnd(seed); // mersenne_twister_engine
+    std::uniform_int_distribution<uint32_t> dis(0, UINT32_MAX);
+    for (int i = 0; i < numLoops; i++)
+    {
+      int p1 = dis(rnd) % numValid;
+      int p2 = dis(rnd) % numValid;
+      int p3 = dis(rnd) % numValid;
+      int p4 = dis(rnd) % numValid;
+      while (p2 == p1)
+        p2 = dis(rnd) % numValid;
+      while (p3 == p1 || p3 == p2)
+        p3 = dis(rnd) % numValid;
+      while (p4 == p1 || p4 == p2 || p4 == p3)
+        p4 = dis(rnd) % numValid;
+      h_randPts[i + 0 * numLoops] = validPts[p1];
+      h_randPts[i + 1 * numLoops] = validPts[p2];
+      h_randPts[i + 2 * numLoops] = validPts[p3];
+      h_randPts[i + 3 * numLoops] = validPts[p4];
+    }
+#ifdef DEVICE_TIMER
+    auto start_malloc_2 = std::chrono::steady_clock::now();
+#endif
+    // Device staging buffers for the strided xpos, ypos, match_xpos and
+    // match_ypos fields, in that order.
+    float *d_fields[4];
+    for (int k = 0; k < 4; k++)
+      d_fields[k] = (float *)sycl::malloc_device(szPt * numPtsUp, q_ct);
+#ifdef DEVICE_TIMER
+    auto stop_malloc_2 = std::chrono::steady_clock::now();
+    matchTime += std::chrono::duration<float, std::micro>(stop_malloc_2 - start_malloc_2).count();
+#endif
+#ifdef DEVICE_TIMER
+    auto start_memcpy_2 = std::chrono::steady_clock::now();
+#endif
+
+    q_ct.memcpy(d_randPts, h_randPts, randSize).wait();
+    infra::sift_memcpy(d_fields[0], &d_sift[0].xpos, fieldSpan, infra::device_to_device, q_ct);
+    infra::sift_memcpy(d_fields[1], &d_sift[0].ypos, fieldSpan, infra::device_to_device, q_ct);
+    infra::sift_memcpy(d_fields[2], &d_sift[0].match_xpos, fieldSpan, infra::device_to_device, q_ct);
+    infra::sift_memcpy(d_fields[3], &d_sift[0].match_ypos, fieldSpan, infra::device_to_device, q_ct);
+    q_ct.wait();
+
+    // Single-work-item kernels pack each strided field into its plane of
+    // d_coord (they replace 2D copies, which were slower under SYCL).
+    for (int k = 0; k < 4; k++)
+    {
+      float *src = d_fields[k];
+      float *dst = &d_coord[k * numPtsUp];
+      q_ct.parallel_for(
+              sycl::nd_range<3>(sycl::range<3>(1, 1, 1) *
+                                    sycl::range<3>(1, 1, 1),
+                                sycl::range<3>(1, 1, 1)),
+              [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]]
+              {
+                memcopyKernel(src, dst, szPt, szFl, numPts, szFl);
+              })
+          .wait();
+    }
+#ifdef DEVICE_TIMER
+    auto stop_memcpy_2 = std::chrono::steady_clock::now();
+    matchTime += std::chrono::duration<float, std::micro>(stop_memcpy_2 - start_memcpy_2).count();
+#endif
+
+    // Every packing kernel has completed, so the staging buffers can go.
+    for (int k = 0; k < 4; k++)
+      safeCall((sycl::free(d_fields[k], q_ct), 0));
+
+#ifdef DEVICE_TIMER
+    auto start_kernel_1 = std::chrono::steady_clock::now();
+#endif
+    q_ct.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, numLoops / 16) *
+                                  sycl::range<3>(1, 1, 16),
+                              sycl::range<3>(1, 1, 16)),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]]
+            {
+              ComputeHomographies(d_coord, d_randPts, d_homo, numPtsUp, item_ct1);
+            })
+        .wait();
+
+#ifdef DEVICE_TIMER
+    auto stop_kernel_1 = std::chrono::steady_clock::now();
+    matchTime += std::chrono::duration<float, std::micro>(stop_kernel_1 - start_kernel_1).count();
+    // printf("ComputeHomographies time =          %.2f us\n", std::chrono::duration<float, std::micro>(stop_kernel_1 - start_kernel_1).count());
+#endif
+    checkMsg("ComputeHomographies() execution failed\n");
+    sycl::range<3> blocks(1, numLoops / TESTHOMO_LOOPS, 1);
+    sycl::range<3> threads(1, TESTHOMO_LOOPS, TESTHOMO_TESTS);
+#ifdef DEVICE_TIMER
+    auto start_kernel_2 = std::chrono::steady_clock::now();
+#endif
+    // d_randPts is reused as the output buffer for the per-candidate counts.
+    // NOTE(review): the kernel is given numPtsUp as its point count, while
+    // only numPts entries per plane of d_coord appear to be filled above —
+    // confirm the padding entries cannot be counted.
+    q_ct.submit([&](sycl::handler &cgh)
+                {
+                  sycl::accessor<float, 1, sycl::access_mode::read_write,
+                                 sycl::access::target::local>
+                      homo_acc_ct1(sycl::range<1>(8 * TESTHOMO_LOOPS), cgh);
+                  sycl::accessor<int, 1, sycl::access_mode::read_write,
+                                 sycl::access::target::local>
+                      cnts_acc_ct1(sycl::range<1>(TESTHOMO_TESTS * TESTHOMO_LOOPS), cgh);
+
+                  cgh.parallel_for(sycl::nd_range<3>(blocks * threads, threads),
+                                   [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]]
+                                   {
+                                     TestHomographies(d_coord, d_homo, d_randPts, numPtsUp,
+                                                      thresh * thresh, item_ct1,
+                                                      homo_acc_ct1.get_pointer(),
+                                                      cnts_acc_ct1.get_pointer());
+                                   });
+                })
+        .wait();
+#ifdef DEVICE_TIMER
+    auto stop_kernel_2 = std::chrono::steady_clock::now();
+    matchTime += std::chrono::duration<float, std::micro>(stop_kernel_2 - start_kernel_2).count();
+    // printf("TestHomographies time =          %.2f us\n", std::chrono::duration<float, std::micro>(stop_kernel_2 - start_kernel_2).count());
+#endif
+    checkMsg("TestHomographies() execution failed\n");
+#ifdef DEVICE_TIMER
+    auto start_memcpy_3 = std::chrono::steady_clock::now();
+#endif
+    q_ct.memcpy(h_randPts, d_randPts, sizeof(int) * numLoops).wait();
+#ifdef DEVICE_TIMER
+    auto stop_memcpy_3 = std::chrono::steady_clock::now();
+    matchTime += std::chrono::duration<float, std::micro>(stop_memcpy_3 - start_memcpy_3).count();
+#endif
+    // Pick the candidate with the highest count (the first one wins ties).
+    int maxIndex = -1, maxCount = -1;
+
+    for (int i = 0; i < numLoops; i++)
+      if (h_randPts[i] > maxCount)
+      {
+        maxCount = h_randPts[i];
+        maxIndex = i;
+      }
+
+    *numMatches = maxCount;
+#ifdef DEVICE_TIMER
+    auto start_memcpy_4 = std::chrono::steady_clock::now();
+#endif
+    // Strided copy of the winner's 8 parameters (one per plane of d_homo)
+    // into homography[0..7].
+    safeCall((infra::sift_memcpy(homography, szFl, &d_homo[maxIndex],
+                                 sizeof(float) * numLoops, szFl, 8,
+                                 infra::device_to_host, q_ct),
+              0));
+    q_ct.wait();
+#ifdef DEVICE_TIMER
+    auto stop_memcpy_4 = std::chrono::steady_clock::now();
+    matchTime += std::chrono::duration<float, std::micro>(stop_memcpy_4 - start_memcpy_4).count();
+#endif
+  }
+  free(validPts);
+  free(h_randPts);
+  safeCall((sycl::free(d_homo, q_ct), 0));
+  safeCall((sycl::free(d_randPts, q_ct), 0));
+  safeCall((sycl::free(d_coord, q_ct), 0));
+  return matchTime;
+}
+
+double MatchSiftData(SiftData &data1, SiftData &data2, sycl::queue &q_ct, float &matchTime)
+{
+  float matchSiftDataTime = 0.0;
+
+  int numPts1 = data1.numPts;
+  int numPts2 = data2.numPts;
+
+  if (!numPts1 || !numPts2)
+    return 0.0;
+#ifdef MANAGEDMEM
+  SiftPoint *sift1 = data1.m_data;
+  SiftPoint *sift2 = data2.m_data;
+#else
+  if (data1.d_data == NULL || data2.d_data == NULL)
+    return 0.0f;
+  SiftPoint *sift1 = data1.d_data;
+  SiftPoint *sift2 = data2.d_data;
+#endif
+// Original version with correlation and maximization in two different kernels
+// Global memory reguirement: O(N^2)
+#if 0
+  float *d_corrData; 
+  int corrWidth = iDivUp(numPts2, 16)*16;
+  int corrSize = sizeof(float)*numPts1*corrWidth;
+  safeCall(cudaMalloc((void **)&d_corrData, corrSize));
+#if 0 // K40c 10.9ms, 1080 Ti 3.8ms
+  dim3 blocks1(numPts1, iDivUp(numPts2, 16));
+  dim3 threads1(16, 16); // each block: 1 points x 16 points
+  MatchSiftPoints<<<blocks1, threads1>>>(sift1, sift2, d_corrData, numPts1, numPts2);
+#else // K40c 7.6ms, 1080 Ti 1.4ms
+  dim3 blocks(iDivUp(numPts1,16), iDivUp(numPts2, 16));
+  dim3 threads(16, 16); // each block: 16 points x 16 points
+  MatchSiftPoints2<<<blocks, threads>>>(sift1, sift2, d_corrData, numPts1, numPts2);
+#endif
+  safeCall(cudaDeviceSynchronize());
+  dim3 blocksMax(iDivUp(numPts1, 16));
+  dim3 threadsMax(16, 16);
+  FindMaxCorr<<<blocksMax, threadsMax>>>(d_corrData, sift1, sift2, numPts1, corrWidth, sizeof(SiftPoint));
+  safeCall(cudaDeviceSynchronize());
+  checkMsg("FindMaxCorr() execution failed\n");
+  safeCall(cudaFree(d_corrData));
+#endif
+
+// Version suggested by Nicholas Lin with combined correlation and maximization
+// Global memory reguirement: O(N)
+#if 0
+  int block_dim = 16;
+  float *d_corrData;
+  int corrSize = numPts1 * block_dim * 2;
+  safeCall(cudaMalloc((void **)&d_corrData, sizeof(float) * corrSize));
+  dim3 blocks(iDivUp(numPts1, block_dim));
+  dim3 threads(block_dim, block_dim); 
+  FindMaxCorr3<<<blocks, threads >>>(d_corrData, sift1, sift2, numPts1, numPts2);
+  safeCall(cudaDeviceSynchronize());
+  checkMsg("FindMaxCorr3() execution failed\n");
+  safeCall(cudaFree(d_corrData));
+#endif
+
+// Combined version with no global memory requirement using one 1 point per block
+#if 0
+  dim3 blocksMax(numPts1);
+  dim3 threadsMax(FMC2W, FMC2H);
+  FindMaxCorr2<<<blocksMax, threadsMax>>>(sift1, sift2, numPts1, numPts2);
+  safeCall(cudaDeviceSynchronize());
+  checkMsg("FindMaxCorr2() execution failed\n");
+#endif
+
+// Combined version with no global memory requirement using one FMC2H points per block
+#if 0
+  dim3 blocksMax2(iDivUp(numPts1, FMC2H));
+  dim3 threadsMax2(FMC2W, FMC2H);
+  FindMaxCorr4<<<blocksMax2, threadsMax2>>>(sift1, sift2, numPts1, numPts2);
+  safeCall(cudaDeviceSynchronize());
+  checkMsg("FindMaxCorr4() execution failed\n");
+#endif
+
+// Combined version with no global memory requirement using global locks
+#if 1
+  sycl::range<3> blocksMax3(1, iDivUp(numPts2, 512), iDivUp(numPts1, 16));
+  sycl::range<3> threadsMax3(1, 16, 16);
+#ifdef DEVICE_TIMER
+  auto start_kernel1 = std::chrono::steady_clock::now();
+#endif
+
+  q_ct.parallel_for(
+          sycl::nd_range<3>(sycl::range<3>(1, 1, iDivUp(numPts1, 64)) *
+                                sycl::range<3>(1, 1, 64),
+                            sycl::range<3>(1, 1, 64)),
+          [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]]
+          {
+            CleanMatches(sift1, numPts1, item_ct1);
+          })
+      .wait();
+
+#ifdef DEVICE_TIMER
+  auto stop_kernel1 = std::chrono::steady_clock::now();
+  // printf("CleanMatches time =          %.2f us\n", std::chrono::duration<float, std::micro>(stop_kernel1 - start_kernel1).count());
+
+  matchTime += std::chrono::duration<float, std::micro>(stop_kernel1 - start_kernel1).count();
+  matchSiftDataTime += std::chrono::duration<float, std::micro>(stop_kernel1 - start_kernel1).count();
+#endif
+
+  int mode = 10;
+  if (mode == 5)
+    q_ct.submit([&](sycl::handler &cgh)
+                {
+                                       lock.init();
+
+                                       auto lock_ptr_ct1 = lock.get_ptr();
+
+                                       sycl::accessor<float, 1, sycl::access_mode::read_write,
+                                                      sycl::access::target::local>
+                                           siftParts1_acc_ct1(sycl::range<1>(272 /*17*16*/), cgh);
+                                       sycl::accessor<float, 1, sycl::access_mode::read_write,
+                                                      sycl::access::target::local>
+                                           siftParts2_acc_ct1(sycl::range<1>(272 /*17*16*/), cgh);
+
+                                       cgh.parallel_for(sycl::nd_range<3>(blocksMax3 * threadsMax3, threadsMax3),
+                                                        [=](sycl::nd_item<3> item_ct1)[[intel::reqd_sub_group_size(
+                                                                                           32)]]
+                                                        {
+                                                          FindMaxCorr5(sift1, sift2, numPts1, numPts2, item_ct1,
+                                                                       lock_ptr_ct1,
+                                                                       siftParts1_acc_ct1.get_pointer(),
+                                                                       siftParts2_acc_ct1.get_pointer());
+                                                        }); });
+  else if (mode == 6)
+  {
+    threadsMax3 = sycl::range<3>(1, 16, 32);
+    q_ct.submit([&](sycl::handler &cgh)
+                {
+                                       lock.init();
+
+                                       auto lock_ptr_ct1 = lock.get_ptr();
+
+                                       sycl::accessor<float, 1, sycl::access_mode::read_write,
+                                                      sycl::access::target::local>
+                                           siftParts2_acc_ct1(sycl::range<1>(2048 /*128*16*/), cgh);
+                                       sycl::accessor<float, 1, sycl::access_mode::read_write,
+                                                      sycl::access::target::local>
+                                           sums_acc_ct1(sycl::range<1>(256 /*16*16*/), cgh);
+
+                                       cgh.parallel_for(
+                                           sycl::nd_range<3>(blocksMax3 * threadsMax3, threadsMax3),
+                                           [=](sycl::nd_item<3> item_ct1)[[intel::reqd_sub_group_size(
+                                                                                           32)]]
+                                              {                                                
+                                                 FindMaxCorr6(sift1, sift2, numPts1, numPts2, item_ct1,
+                                                              lock_ptr_ct1, siftParts2_acc_ct1.get_pointer(),
+                                                              sums_acc_ct1.get_pointer());
+                                               }); });
+  }
+  else if (mode == 7)
+    q_ct.submit([&](sycl::handler &cgh)
+                {
+                                       lock.init();
+
+                                       auto lock_ptr_ct1 = lock.get_ptr();
+
+                                       sycl::accessor<float, 1, sycl::access_mode::read_write,
+                                                      sycl::access::target::local>
+                                           siftParts1_acc_ct1(sycl::range<1>(1088 /*17*64*/), cgh);
+                                       sycl::accessor<float, 1, sycl::access_mode::read_write,
+                                                      sycl::access::target::local>
+                                           siftParts2_acc_ct1(sycl::range<1>(1024 /*16*64*/), cgh);
+
+                                       cgh.parallel_for(sycl::nd_range<3>(blocksMax3 * threadsMax3, threadsMax3),
+                                                        [=](sycl::nd_item<3> item_ct1)[[intel::reqd_sub_group_size(
+                                                                                           32)]]
+                                                        {
+                                                          FindMaxCorr7(sift1, sift2, numPts1, numPts2, item_ct1,
+                                                                       lock_ptr_ct1,
+                                                                       siftParts1_acc_ct1.get_pointer(),
+                                                                       siftParts2_acc_ct1.get_pointer());
+                                                        }); });
+  else if (mode == 8)
+  {
+    blocksMax3 =
+        sycl::range<3>(1, iDivUp(numPts2, FMC_GH), iDivUp(numPts1, FMC_BW));
+    threadsMax3 = sycl::range<3>(1, FMC_NH, FMC_NW);
+    q_ct.submit([&](sycl::handler &cgh)
+                {
+                                       lock.init();
+
+                                       auto lock_ptr_ct1 = lock.get_ptr();
+
+                                       sycl::accessor<sycl::float4, 1, sycl::access_mode::read_write,
+                                                      sycl::access::target::local>
+                                           siftParts1_acc_ct1(sycl::range<1>(512 /*FMC_BW*FMC_BD*/), cgh);
+                                       sycl::accessor<sycl::float4, 1, sycl::access_mode::read_write,
+                                                      sycl::access::target::local>
+                                           siftParts2_acc_ct1(sycl::range<1>(512 /*FMC_BH*FMC_BD*/), cgh);
+                                       sycl::accessor<float, 1, sycl::access_mode::read_write,
+                                                      sycl::access::target::local>
+                                           blksums_acc_ct1(sycl::range<1>(1024 /*FMC_BW*FMC_BH*/), cgh);
+
+                                       cgh.parallel_for(sycl::nd_range<3>(blocksMax3 * threadsMax3, threadsMax3),
+                                                        [=](sycl::nd_item<3> item_ct1)[[intel::reqd_sub_group_size(
+                                                                                           32)]]
+                                                        {
+                                                          FindMaxCorr8(sift1, sift2, numPts1, numPts2, item_ct1,
+                                                                       lock_ptr_ct1,
+                                                                       siftParts1_acc_ct1.get_pointer(),
+                                                                       siftParts2_acc_ct1.get_pointer(),
+                                                                       blksums_acc_ct1.get_pointer());
+                                                        }); });
+  }
+  else if (mode == 9)
+  {
+    blocksMax3 =
+        sycl::range<3>(1, iDivUp(numPts2, FMC_GH), iDivUp(numPts1, FMC_BW));
+    threadsMax3 = sycl::range<3>(1, FMC_NH, FMC_NW);
+    q_ct.submit([&](sycl::handler &cgh)
+                {
+                                       lock.init();
+
+                                       auto lock_ptr_ct1 = lock.get_ptr();
+
+                                       sycl::accessor<sycl::float4, 1, sycl::access_mode::read_write,
+                                                      sycl::access::target::local>
+                                           siftParts1_acc_ct1(sycl::range<1>(512 /*FMC_BW*FMC_BD*/), cgh);
+                                       sycl::accessor<sycl::float4, 1, sycl::access_mode::read_write,
+                                                      sycl::access::target::local>
+                                           siftParts2_acc_ct1(sycl::range<1>(512 /*FMC_BH*FMC_BD*/), cgh);
+
+                                       cgh.parallel_for(sycl::nd_range<3>(blocksMax3 * threadsMax3, threadsMax3),
+                                                        [=](sycl::nd_item<3> item_ct1)[[intel::reqd_sub_group_size(
+                                                                                           32)]]
+                                                        {
+                                                          FindMaxCorr9(sift1, sift2, numPts1, numPts2, item_ct1,
+                                                                       lock_ptr_ct1,
+                                                                       siftParts1_acc_ct1.get_pointer(),
+                                                                       siftParts2_acc_ct1.get_pointer());
+                                                        }); });
+  }
+  else if (mode == 10)
+  {
+    try
+    {
+
+      blocksMax3 = sycl::range<3>(1, 1, iDivUp(numPts1, M7W));
+      threadsMax3 = sycl::range<3>(1, (M7H / M7R), M7W); //(1 , 8 , 32)
+
+#ifdef DEVICE_TIMER
+      auto start_kernel2 = std::chrono::steady_clock::now();
+#endif
+      q_ct.submit([&](sycl::handler &cgh)
+                  {
+                                       sycl::accessor<sycl::float4, 1, sycl::access_mode::read_write,
+                                                      sycl::access::target::local>
+                                           buffer1_acc_ct1(sycl::range<1>(1024 /*M7W*NDIM/4*/), cgh);
+                                          // buffer1_acc_ct1(sycl::range<1>(M7W*NDIM/4), cgh);
+                                       sycl::accessor<sycl::float4, 1, sycl::access_mode::read_write,
+                                                      sycl::access::target::local>
+                                            buffer2_acc_ct1(sycl::range<1>(1024 /*M7H*NDIM/4*/), cgh);
+                                          //  buffer2_acc_ct1(sycl::range<1>(M7H*NDIM/4), cgh);
+
+                                       cgh.parallel_for(sycl::nd_range<3>(blocksMax3 * threadsMax3, threadsMax3),
+                                                        [=](sycl::nd_item<3> item_ct1)
+                                                        [[intel::reqd_sub_group_size(32)]]
+                                                        {
+                                                          FindMaxCorr10(sift1, sift2, numPts1, numPts2, item_ct1,
+                                                                        buffer1_acc_ct1.get_pointer(),
+                                                                        buffer2_acc_ct1.get_pointer());
+                                                        }); })
+          .wait();
+#ifdef DEVICE_TIMER
+      auto stop_kernel2 = std::chrono::steady_clock::now();
+      // printf("FindMaxCorr10 time =          %.2f us\n", std::chrono::duration<float, std::micro>(stop_kernel2 - start_kernel2).count());
+      matchTime += std::chrono::duration<float, std::micro>(stop_kernel2 - start_kernel2).count();
+      matchSiftDataTime += std::chrono::duration<float, std::micro>(stop_kernel2 - start_kernel2).count();
+#endif
+    }
+    catch (sycl::exception const &e)
+    {
+      std::cerr << e.what() << '\n';
+    }
+  }
+  checkMsg("FindMaxCorr5() execution failed\n");
+#endif
+
+  if (data1.h_data != NULL)
+  {
+    float *h_ptr = &data1.h_data[0].score;
+    float *d_ptr = &data1.d_data[0].score;
+#ifdef DEVICE_TIMER
+    auto start_memcpy = std::chrono::steady_clock::now();
+#endif
+    // infra::sift_memcpy(h_ptr, sizeof(SiftPoint), d_ptr, sizeof(SiftPoint), 5 * sizeof(float), data1.numPts, infra::device_to_host, q_ct);
+    infra::sift_memcpy(h_ptr, d_ptr, sizeof(SiftPoint) * data1.numPts, infra::device_to_host, q_ct);
+    q_ct.wait();
+#ifdef DEVICE_TIMER
+    auto stop_memcpy = std::chrono::steady_clock::now();
+    matchTime += std::chrono::duration<float, std::micro>(stop_memcpy - start_memcpy).count();
+    matchSiftDataTime += std::chrono::duration<float, std::micro>(stop_memcpy - start_memcpy).count();
+#endif
+  }
+  return matchTime;
+}