diff --git a/.github/workflows/C++.yml b/.github/workflows/C++.yml
index 80d4e9ef6..ee37701cb 100644
--- a/.github/workflows/C++.yml
+++ b/.github/workflows/C++.yml
@@ -1,11 +1,6 @@
 name: C++ Build

 on:
-  push:
-    branches:
-      - master
-    tags:
-      - v*
   pull_request:
     branches:
       - master
diff --git a/.github/workflows/build_cufinufft_wheels.yml b/.github/workflows/build_cufinufft_wheels.yml
index bc876305e..85526f077 100644
--- a/.github/workflows/build_cufinufft_wheels.yml
+++ b/.github/workflows/build_cufinufft_wheels.yml
@@ -1,6 +1,22 @@
 name: Build cufinufft Python wheels

-on: [push, pull_request]
+on:
+  # Weekly schedule (Sunday 00:00 UTC)
+  schedule:
+    - cron: "0 0 * * 0"
+
+  # On tag push
+  push:
+    tags:
+      - 'v*'
+  # Manual trigger (with an optional deploy toggle)
+  workflow_dispatch:
+    inputs:
+      deploy:
+        description: "Deploy after build?"
+        type: boolean
+        required: false
+        default: false

 jobs:
   build_wheels:
@@ -17,9 +33,16 @@ jobs:
       - uses: ilammy/msvc-dev-cmd@v1
       - name: Setup CUDA
         if: ${{ matrix.buildplat[0] == 'windows-2022' }}
-        uses: Jimver/cuda-toolkit@v0.2.21
+        uses: Jimver/cuda-toolkit@v0.2.24
         with:
           cuda: '12.4.0'
+          method: 'network'
+          linux-local-args: '["--toolkit"]'
+          log-file-suffix: '${{matrix.os}}-cuda-install.txt'
+          sub-packages: ${{runner.os == 'Linux' && '["nvcc", "cudart"]' || '["nvcc", "cudart", "cufft", "cufft_dev"]'}}
+          non-cuda-sub-packages: ${{runner.os == 'Linux' && '["libcufft","libcufft-dev"]' || '[]'}}
+          use-local-cache: 'false'
+          use-github-cache: 'false' #it is not working at the moment https://github.com/Jimver/cuda-toolkit/issues/390
       - name: Build ${{ matrix.buildplat[1] }} wheels
         uses: pypa/cibuildwheel@v2.22.0
         with:
diff --git a/.github/workflows/build_finufft_wheels.yml b/.github/workflows/build_finufft_wheels.yml
index 3798556df..d9de5e030 100644
--- a/.github/workflows/build_finufft_wheels.yml
+++ b/.github/workflows/build_finufft_wheels.yml
@@ -1,6 +1,23 @@
 name: Build and test finufft Python wheels

-on: [ push, pull_request ]
+on:
+  # Weekly schedule (Sunday 00:00 UTC)
+  schedule:
+    - cron: "0 0 * * 0"
+
+  # On tag push
+  push:
+    tags:
+      - 'v*'
+  # Manual trigger (with an optional deploy toggle)
+  workflow_dispatch:
+    inputs:
+      deploy:
+        description: "Deploy after build?"
+        type: boolean
+        required: false
+        default: false
+

 jobs:
   build_wheels:
diff --git a/.github/workflows/cmake_ci.yml b/.github/workflows/cmake_ci.yml
index 29376e3b9..9fa89b20b 100644
--- a/.github/workflows/cmake_ci.yml
+++ b/.github/workflows/cmake_ci.yml
@@ -3,18 +3,6 @@ name: cmake ci linux macos windows
 on: [push, pull_request]

 jobs:
-  prepare:
-    runs-on: ubuntu-22.04
-    outputs:
-      matrix: ${{ steps.generate_matrix.outputs.matrix }}
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-      - name: Generate matrix
-        id: generate_matrix
-        run: |
-          MATRIX=$(python3 ${{ github.workspace }}/.github/workflows/generate_cmake_matrix.py)
-          echo "matrix=$MATRIX" >> $GITHUB_OUTPUT
   pip-requirements:
     runs-on: ubuntu-22.04
     steps:
@@ -23,7 +11,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
-          cache : 'pip'
+          cache: 'pip'
           python-version: '3.10'
       - name: Generate requirements.txt
         run: |
@@ -38,142 +26,193 @@ jobs:
         with:
           name: requirements
           path: requirements.txt
+
   cache:
     strategy:
       matrix:
-        os:
-          - ubuntu-22.04
-          - windows-2022
+        os: [ubuntu-22.04, windows-2022]
     runs-on: ${{ matrix.os }}
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
       - name: create cache directory
-        run: |
-          mkdir -p cpm
+        shell: bash
+        run: mkdir -p "cpm"
       - name: Check if cache exists
         id: cache
         uses: actions/cache@v4
         with:
-          key: cpm-cache-00-${{ runner.os == 'windows' && 'windows-' || '' }}${{ hashFiles('CMakeLists.txt', 'cmake/*') }}
+          key: cpm-cache-00-${{ runner.os == 'windows' && 'windows-' || '' }}${{ hashFiles('**/CMakeLists.txt', 'cmake/**') }}
           enableCrossOsArchive: true
           path: cpm
-      - name: Setup Cpp
+      - name: Setup Cpp (on cache miss)
         if: steps.cache.outputs.cache-hit != 'true'
         uses: aminya/setup-cpp@v1
         with:
-          cmake : true
-      - name: Download dependencies in cache linux
+          cmake: true
+      - name: Download dependencies in cache (Linux)
         if: steps.cache.outputs.cache-hit != 'true' && runner.os == 'Linux'
         run: |
-          cmake -S . -B ./build
+          cmake -S . -B ./build -DCPM_SOURCE_CACHE="cpm"
           rm -rf build
-          cmake -S . -B ./build -DFINUFFT_USE_DUCC0=ON
-        env:
-          CPM_SOURCE_CACHE: cpm
-      - name: Download dependencies in cache windows
+          cmake -S . -B ./build -DFINUFFT_USE_DUCC0=ON -DCPM_SOURCE_CACHE="cpm"
+      - name: Download dependencies in cache (Windows)
         if: steps.cache.outputs.cache-hit != 'true' && runner.os != 'Linux'
         run: |
-          cmake -S . -B build
+          cmake -S . -B build -DCPM_SOURCE_CACHE="cpm"
           rm build -r -force
-          cmake -S . -B build -DFINUFFT_USE_DUCC0=ON
-        env:
-          CPM_SOURCE_CACHE: cpm
+          cmake -S .
-B build -DFINUFFT_USE_DUCC0=ON -DCPM_SOURCE_CACHE="cpm" - name: Cache dependencies + if: steps.cache.outputs.cache-hit != 'true' uses: actions/cache/save@v4 with: - key: cpm-cache-00-${{ runner.os == 'windows' && 'windows-' || '' }}${{ hashFiles('CMakeLists.txt', 'cmake/*') }} + key: cpm-cache-00-${{ runner.os == 'windows' && 'windows-' || '' }}${{ hashFiles('**/CMakeLists.txt', 'cmake/**') }} enableCrossOsArchive: true path: cpm + cmake-ci: runs-on: ${{ matrix.os }} - needs: [prepare, cache, pip-requirements] + needs: [cache, pip-requirements] + env: + CMAKE_GENERATOR: Ninja + CPM_SOURCE_CACHE: cpm + CTEST_OUTPUT_ON_FAILURE: "1" + SCCACHE_GHA_ENABLED: "true" + SCCACHE_GHA_VERSION: "0" + CMAKE_C_COMPILER_LAUNCHER: sccache + CMAKE_CXX_COMPILER_LAUNCHER: sccache + CMAKE_CUDA_COMPILER_LAUNCHER: sccache strategy: fail-fast: false - matrix: ${{ fromJSON(needs.prepare.outputs.matrix) }} + matrix: + include: + - { os: ubuntu-22.04, toolchain: gcc-10 } + - { os: ubuntu-22.04, toolchain: gcc-11 } + - { os: ubuntu-22.04, toolchain: gcc-12 } + - { os: ubuntu-22.04, toolchain: gcc-13 } + - { os: ubuntu-22.04, toolchain: clang-16 } + - { os: ubuntu-22.04, toolchain: clang-17 } + - { os: ubuntu-22.04, toolchain: clang-18 } + + - { os: windows-2022, toolchain: msvc } + - { os: windows-2022, toolchain: clang-19 } + + - { os: macos-13, toolchain: llvm } + - { os: macos-13, toolchain: gcc-14 } + - { os: macos-14, toolchain: llvm } + - { os: macos-14, toolchain: gcc-14 } + - { os: macos-15, toolchain: llvm } +# - { os: macos-15, toolchain: gcc-15 } # it is broken at the moment, but it is not caused by finufft + steps: - name: Checkout code uses: actions/checkout@v4 + - name: Restore Cache uses: actions/cache/restore@v4 with: - key: cpm-cache-00-${{ runner.os == 'windows' && 'windows-' || '' }}${{ hashFiles('CMakeLists.txt', 'cmake/*') }} + key: cpm-cache-00-${{ runner.os == 'windows' && 'windows-' || '' }}${{ hashFiles('**/CMakeLists.txt', 'cmake/**') }} enableCrossOsArchive: true path: cpm + - name: Download requirements.txt uses: actions/download-artifact@v4 with: name: requirements + - name: Run sccache-cache uses: mozilla-actions/sccache-action@v0.0.9 - - name: Set caching env vars - run: | - echo "SCCACHE_GHA_ENABLED=true" >> $GITHUB_ENV - echo "SCCACHE_GHA_VERSION=0" >> $GITHUB_ENV - - name: Set up sccache on Windows - if: runner.os == 'Windows' - run: Add-Content -Path $env:GITHUB_ENV -Value "SCCACHE_GHA_ENABLED=true" + - name: Setup Cpp - uses: aminya/setup-cpp@v1.1.1 + uses: aminya/setup-cpp@v1 with: compiler: ${{ matrix.toolchain }} - vcvarsall: ${{ contains(matrix.os, 'windows') }} + vcvarsall: true cmake: true ninja: true - vcpkg: false - cppcheck: false - clangtidy: false - - name: Set min macOS version and install fftw - if: runner.os == 'macOS' + + - name: Install toolchain + FFTW (macOS) + if: startsWith(matrix.os, 'macos-') run: | - brew install fftw libomp - - name: Install fftw - if: runner.os == 'linux' + export HOMEBREW_NO_ANALYTICS=1 HOMEBREW_NO_INSTALL_CLEANUP=1 + brew install libomp fftw + { + echo "LDFLAGS=-L$(brew --prefix libomp)/lib" + echo "CPPFLAGS=-I$(brew --prefix libomp)/include" + echo "LIBRARY_PATH=$(brew --prefix libomp)/lib" + echo "DYLD_LIBRARY_PATH=$(brew --prefix libomp)/lib" + echo "CMAKE_PREFIX_PATH=$(brew --prefix libomp)" + ver="${{ matrix.os }}"; ver="${ver#macos-}" + echo "MACOSX_DEPLOYMENT_TARGET=${ver}.0" + } >> "$GITHUB_ENV" + + - name: Install fftw (Linux) + if: runner.os == 'Linux' run: | sudo apt update sudo apt install -y libfftw3-dev - - name: Configure 
Cmake - run: | - cmake -S . -B ./build -G Ninja -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE:STRING=${{matrix.build_type}} -DFINUFFT_ARCH_FLAGS=${{ matrix.arch_flags }} -DFINUFFT_BUILD_TESTS=ON -DFINUFFT_STATIC_LINKING=${{matrix.finufft_static_linking}} -DFINUFFT_USE_DUCC0=${{ matrix.ducc_fft }} -DFINUFFT_ENABLE_SANITIZERS=${{matrix.build_type == 'Debug'}} - env: - CPM_SOURCE_CACHE: cpm - - name: Build - run: | - cmake --build ./build --config ${{matrix.build_type}} - - name: Test - working-directory: ./build - run: | - ctest -C ${{matrix.build_type}} --output-on-failure + - name: Set up Python - if: matrix.finufft_static_linking == 'off' uses: actions/setup-python@v5 with: python-version: '3.10' cache: 'pip' cache-dependency-path: requirements.txt - - name: Build Python wheels - if: matrix.finufft_static_linking == 'off' - env: - MACOSX_DEPLOYMENT_TARGET: 13 - CPM_SOURCE_CACHE: cpm - shell: bash + + - name: Install Python requirements run: | python3 -m pip install --upgrade pip python3 -m pip install -r requirements.txt - python3 -m pip install \ - --verbose \ - -C cmake.define.CMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -C cmake.define.FINUFFT_ARCH_FLAGS=${{ matrix.arch_flags }} \ - -C cmake.define.FINUFFT_USE_DUCC0=${{ matrix.ducc_fft }} \ - -C cmake.define.CMAKE_CXX_COMPILER_LAUNCHER=sccache \ - -C cmake.define.CMAKE_C_COMPILER_LAUNCHER=sccache \ - -C cmake.define.CMAKE_GENERATOR=Ninja \ - -C cmake.define.CMAKE_GENERATOR_PLATFORM= \ - python/finufft - - name: Test Python package - if: matrix.finufft_static_linking == 'off' + + - name: Build and test configurations + shell: bash run: | - python3 -m pytest python/finufft/test + if [[ "$RUNNER_OS" == "Windows" ]]; then + configs=("build_type=Release ducc_fft=On" "build_type=Debug ducc_fft=On") + else + configs=("build_type=Release ducc_fft=On" "build_type=Release ducc_fft=Off" \ + "build_type=Debug ducc_fft=On" "build_type=Debug ducc_fft=Off") + fi + + if [[ "${{ matrix.toolchain }}" == "msvc" ]]; then + arch_flags=("/arch:AVX2" "/arch:SSE2" "native") + elif [[ "$RUNNER_OS" == "Linux" ]]; then + arch_flags=("-march=x86-64" "-march=x86-64-v2" "-march=x86-64-v3" "-march=native") + else + arch_flags=("native") + fi + + for arch in "${arch_flags[@]}"; do + arch_dir=$(echo "$arch" | tr -c 'A-Za-z0-9' '_') + for cfg in "${configs[@]}"; do + eval "$cfg" + build_dir="build_${build_type}_${ducc_fft}_${arch_dir}" + + cmake -E make_directory "$build_dir" + cmake -S . -B "$build_dir" \ + -DCMAKE_BUILD_TYPE=$build_type \ + -DFINUFFT_ARCH_FLAGS="$arch" \ + -DFINUFFT_USE_DUCC0=$ducc_fft \ + -DCMAKE_MSVC_DEBUG_INFORMATION_FORMAT=Embedded \ + -DCMAKE_POLICY_DEFAULT_CMP0141=NEW + # CMake will pick the toolchain set by setup-cpp (and macOS brew block for gcc-14). 
+ + cmake --build "$build_dir" --config "$build_type" + (cd "$build_dir" && ctest -C "$build_type") + + python3 -m pip install \ + --verbose \ + -C cmake.define.CMAKE_BUILD_TYPE=$build_type \ + -C cmake.define.FINUFFT_ARCH_FLAGS="$arch" \ + -C cmake.define.FINUFFT_USE_DUCC0=$ducc_fft \ + -C cmake.define.CMAKE_MSVC_DEBUG_INFORMATION_FORMAT=Embedded \ + -C cmake.define.CMAKE_POLICY_DEFAULT_CMP0141=NEW \ + python/finufft + + python3 -m pytest python/finufft/test + done + done + cleanup: runs-on: ubuntu-22.04 needs: cmake-ci diff --git a/.github/workflows/fortran.yml b/.github/workflows/fortran.yml new file mode 100644 index 000000000..3c5d08118 --- /dev/null +++ b/.github/workflows/fortran.yml @@ -0,0 +1,77 @@ +name: finufft fortran ctest + +on: + # Weekly schedule (Sunday 00:00 UTC) + schedule: + - cron: "0 0 * * 0" + + # On tag push + push: + tags: + - 'v*' + # Manual trigger (with an optional deploy toggle) + workflow_dispatch: + inputs: + deploy: + description: "Deploy after build?" + type: boolean + required: false + default: false +jobs: + build-and-test: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ ubuntu-22.04, macos-14, windows-2022 ] + + env: + CTEST_OUTPUT_ON_FAILURE: "1" + + steps: + - name: Checkout + uses: actions/checkout@v4 + # Single, simple Fortran toolchain everywhere (gfortran). + # This action also sets FC, and typically CC/CXX to compatible compilers. + - name: Setup Fortran + id: setup-fortran + uses: fortran-lang/setup-fortran@v1 + + # CMake + Ninja (uniform across OSes) + - name: Setup CMake & Ninja + uses: aminya/setup-cpp@v1 + with: + cmake: true + ninja: true + - name: Install toolchain + FFTW (macOS) + if: startsWith(matrix.os, 'macos-') + run: | + export HOMEBREW_NO_ANALYTICS=1 HOMEBREW_NO_INSTALL_CLEANUP=1 + brew install libomp + { + echo "LDFLAGS=-L$(brew --prefix libomp)/lib" + echo "CPPFLAGS=-I$(brew --prefix libomp)/include" + echo "LIBRARY_PATH=$(brew --prefix libomp)/lib" + echo "DYLD_LIBRARY_PATH=$(brew --prefix libomp)/lib" + echo "CMAKE_PREFIX_PATH=$(brew --prefix libomp)" + } >> "$GITHUB_ENV" + - name: Configure (RelWithDebInfo + examples/fortran/tests) + shell: bash + run: | + cmake -S . 
-B build -G Ninja \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_C_COMPILER="${CC}" \ + -DCMAKE_CXX_COMPILER="${CXX}" \ + -DCMAKE_Fortran_COMPILER="${FC}" \ + -DFINUFFT_USE_DUCC0=ON \ + -DFINUFFT_BUILD_EXAMPLES=ON \ + -DFINUFFT_BUILD_FORTRAN=ON \ + -DFINUFFT_BUILD_TESTS=ON + + - name: Build + shell: bash + run: cmake --build build --config RelWithDebInfo --parallel + + - name: Test + shell: bash + run: ctest --test-dir build --output-on-failure diff --git a/.github/workflows/generate_cmake_matrix.py b/.github/workflows/generate_cmake_matrix.py deleted file mode 100644 index 937765e90..000000000 --- a/.github/workflows/generate_cmake_matrix.py +++ /dev/null @@ -1,75 +0,0 @@ -import json - -matrix = { - "include": [] -} - -static_linking = ["On", "Off"] - -combinations = [ - ("ubuntu-22.04", { - "build_type": ["Release", "Debug"], - "toolchain": ["llvm", "gcc"], - "arch_flags": ["-march=native", "-march=x86-64", "native"], - "ducc_fft": ["On", "Off"] - }), - ("windows-2022", { - "build_type": ["Release", "Debug"], - "toolchain": ["msvc"], - "arch_flags": ["/arch:AVX2", "/arch:SSE2", "native"], - "ducc_fft": ["On", "Off"] - }), - ("windows-2022", { - "build_type": ["Release"], - "toolchain": ["llvm"], - "arch_flags": ["-march=native", "-march=x86-64", "native"], - "ducc_fft": ["On", "Off"] - }), - ("macos-13", { - "build_type": ["Release", "Debug"], - "toolchain": ["llvm", "gcc-14"], - "arch_flags": ["-march=native", "-march=x86-64", "native"], - "ducc_fft": ["On", "Off"] - }) -] - -def get_c_compiler(toolchain): - if "gcc" in toolchain: - return "gcc" - elif toolchain == "llvm": - return "clang" - elif toolchain == "msvc": - return "cl" - else: - raise ValueError(f"Unknown toolchain: {toolchain}") - - -def get_cxx_compiler(toolchain): - if "gcc" in toolchain: - return "g++" - elif toolchain == "llvm": - return "clang++" - elif toolchain == "msvc": - return "cl" - else: - raise ValueError(f"Unknown toolchain: {toolchain}") - - -for platform, value in combinations: - for toolchain in value["toolchain"]: - for arch_flag in value["arch_flags"]: - for linking in static_linking: - for build in value["build_type"]: - for ducc in value["ducc_fft"]: - matrix["include"].append({ - "os": platform, - "toolchain": toolchain, - "arch_flags": arch_flag, - "finufft_static_linking": linking, - "build_type": build, - "c_compiler": get_c_compiler(toolchain), - "cxx_compiler": get_cxx_compiler(toolchain), - "ducc_fft": ducc, - }) -json_str = json.dumps(matrix, ensure_ascii=False) -print(json_str) diff --git a/.github/workflows/generate_matrix.py b/.github/workflows/generate_matrix.py deleted file mode 100644 index 29049e864..000000000 --- a/.github/workflows/generate_matrix.py +++ /dev/null @@ -1,36 +0,0 @@ -import json - -matrix = { - "include": [] -} - -python_versions = ["3.8", "3.11"] - -combinations = { - "ubuntu-22.04": { - "compiler": ["llvm", "gcc"], - "arch_flags": ["-march=native", "-march=x86-64"] - }, - "windows-2022": { - "compiler": ["msvc", "llvm"], - "arch_flags": ["/arch:AVX2", "/arch:SSE2"] - }, - "macos-13": { - "compiler": ["llvm", "gcc-14"], - "arch_flags": ["-march=native", "-march=x86-64"] - } -} - -for platform in combinations.keys(): - for python_version in python_versions: - for compiler in combinations[platform]["compiler"]: - for arch_flag in combinations[platform]["arch_flags"]: - matrix["include"].append({ - "os": platform, - "python-version": python_version, - "compiler": compiler, - "arch_flags": arch_flag - }) - -json_str = json.dumps(matrix, ensure_ascii=False) 
-print(json_str) diff --git a/.github/workflows/make-mex.yml b/.github/workflows/make-mex.yml new file mode 100644 index 000000000..bb1b7e350 --- /dev/null +++ b/.github/workflows/make-mex.yml @@ -0,0 +1,71 @@ +name: Generate Matlab mex files +on: + # Weekly schedule (Sunday 00:00 UTC) + schedule: + - cron: "0 0 * * 0" + + # On tag push + push: + tags: + - 'v*' + # Manual trigger (with an optional deploy toggle) + workflow_dispatch: + inputs: + deploy: + description: "Deploy after build?" + type: boolean + required: false + default: false +jobs: + mex-ci: + name: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-22.04, windows-latest, macos-latest] + runs-on: ${{ matrix.os }} + steps: + - name: Check out repository + uses: actions/checkout@v4 + - name: Set up MATLAB + uses: matlab-actions/setup-matlab@v2 + with: + release: R2023b + cache: false + - name: Generate Windows mex file + if: runner.os == 'Windows' + run: | + export PATH=/d/hostedtoolcache/windows/MATLAB/2023.2.999/x64/bin/win64:$PATH + export PATH=/c/msys64/usr/bin:$PATH + export PATH=/c/msys64/mingw64/bin:$PATH + export MW_MINGW64_LOC=/c/msys64/mingw64 + pacman -Sy --noconfirm make mingw-w64-x86_64-toolchain mingw-w64-x86_64-fftw + mex -setup:.github/workflows/mex_C++_win64.xml C++ + cp make-platforms/make.inc.windows_msys make.inc + sed -i '/ LIBSFFT/a \ \ LIBSFFT := `g++ --print-file-name libfftw3.a` `g++ --print-file-name libfftw3f.a` `g++ --print-file-name libfftw3_omp.a` `g++ --print-file-name libfftw3f_omp.a` `g++ --print-file-name libm.a` `g++ --print-file-name libgomp.a`' makefile + /c/msys64/usr/bin/make matlab + shell: C:\msys64\usr\bin\bash.exe {0} + - name: Generate macOS mex file + if: runner.os == 'macOS' + run: | + brew install fftw libomp + sed 's/\/Applications\/MATLAB_R20\*\*.app/\/Users\/runner\/hostedtoolcache\/MATLAB\/2023.2.999\/arm64\/MATLAB.app/' make-platforms/make.inc.macosx_arm64 > make.inc + sed -i -e 's/ LIBSFFT.*/\ \ LIBSFFT := \/opt\/homebrew\/opt\/fftw\/lib\/libfftw3\.a \/opt\/homebrew\/opt\/fftw\/lib\/libfftw3f\.a \/opt\/homebrew\/opt\/fftw\/lib\/libfftw3_omp.a \/opt\/homebrew\/opt\/fftw\/lib\/libfftw3f_omp\.a -L\/Users\/runner\/hostedtoolcache\/MATLAB\/2023.2.999\/arm64\/MATLAB.app\/bin\/maca64\/lib -lomp/' makefile + make matlab + - name: Generate Linux mex file + if: runner.os == 'Linux' + run: | + sudo apt install libfftw3-dev + sed -i '/ LIBSFFT/a \ \ LIBSFFT := `g++ --print-file-name libfftw3.a` `g++ --print-file-name libfftw3f.a` `g++ --print-file-name libfftw3_omp.a` `g++ --print-file-name libfftw3f_omp.a` -lm -lgomp' makefile + make matlab + - name: Run Matlab test + uses: matlab-actions/run-command@v2 + with: + command: addpath(genpath('matlab')), check_finufft + - name: Upload mex files + uses: actions/upload-artifact@v4 + with: + name: ${{matrix.os}}-R2023b-finufft-mex + path: ${{runner.workspace}}/finufft/matlab/finufft.mex* + - name: Setup tmate session + if: ${{ failure() }} + uses: mxschmitt/action-tmate@v3 diff --git a/.github/workflows/mex.yml b/.github/workflows/mex.yml index 60177e64d..ecf1a0c97 100644 --- a/.github/workflows/mex.yml +++ b/.github/workflows/mex.yml @@ -1,55 +1,122 @@ -name: Generate Matlab mex files -on: [push] +name: Build MATLAB MEX + +on: + # Weekly schedule (Sunday 00:00 UTC) + schedule: + - cron: "0 0 * * 0" + + # On tag push + push: + tags: + - 'v*' + # Manual trigger (with an optional deploy toggle) + workflow_dispatch: + inputs: + deploy: + description: "Deploy after build?" 
+ type: boolean + required: false + default: false jobs: - mex-ci: - name: ${{ matrix.os }} + mex: strategy: + fail-fast: false matrix: - os: [ubuntu-22.04, windows-latest, macos-latest] + include: + - os: ubuntu-22.04 + - os: windows-2022 +# - os: macos-13 this requires matlab openmp to work. + - os: macos-14 + - os: macos-15 + name: ${{ matrix.os }} runs-on: ${{ matrix.os }} + steps: - - name: Check out repository - uses: actions/checkout@v4 - - name: Set up MATLAB - uses: matlab-actions/setup-matlab@v2 + - uses: actions/checkout@v4 + + - name: Setup C++ toolchain + uses: aminya/setup-cpp@v1.7.1 with: - release: R2023b - cache: true - - name: Generate Windows mex file - if: runner.os == 'Windows' - run: | - export PATH=/d/hostedtoolcache/windows/MATLAB/2023.2.999/x64/bin/win64:$PATH - export PATH=/c/msys64/usr/bin:$PATH - export PATH=/c/msys64/mingw64/bin:$PATH - export MW_MINGW64_LOC=/c/msys64/mingw64 - pacman -Sy --noconfirm make mingw-w64-x86_64-toolchain mingw-w64-x86_64-fftw - mex -setup:.github/workflows/mex_C++_win64.xml C++ - cp make-platforms/make.inc.windows_msys make.inc - sed -i '/ LIBSFFT/a \ \ LIBSFFT := `g++ --print-file-name libfftw3.a` `g++ --print-file-name libfftw3f.a` `g++ --print-file-name libfftw3_omp.a` `g++ --print-file-name libfftw3f_omp.a` `g++ --print-file-name libm.a` `g++ --print-file-name libgomp.a`' makefile - /c/msys64/usr/bin/make matlab - shell: C:\msys64\usr\bin\bash.exe {0} - - name: Generate macOS mex file + cmake: true + ninja: true + vcvarsall: true + - name: Set macOS deployment target if: runner.os == 'macOS' + shell: bash run: | brew install fftw libomp - sed 's/\/Applications\/MATLAB_R20\*\*.app/\/Users\/runner\/hostedtoolcache\/MATLAB\/2023.2.999\/arm64\/MATLAB.app/' make-platforms/make.inc.macosx_arm64 > make.inc - sed -i -e 's/ LIBSFFT.*/\ \ LIBSFFT := \/opt\/homebrew\/opt\/fftw\/lib\/libfftw3\.a \/opt\/homebrew\/opt\/fftw\/lib\/libfftw3f\.a \/opt\/homebrew\/opt\/fftw\/lib\/libfftw3_omp.a \/opt\/homebrew\/opt\/fftw\/lib\/libfftw3f_omp\.a -L\/Users\/runner\/hostedtoolcache\/MATLAB\/2023.2.999\/arm64\/MATLAB.app\/bin\/maca64\/lib -lomp/' makefile - make matlab - - name: Generate Linux mex file - if: runner.os == 'Linux' + echo "LDFLAGS=-L$(brew --prefix libomp)/lib" >> $GITHUB_ENV + echo "CPPFLAGS=-I$(brew --prefix libomp)/include" >> $GITHUB_ENV + echo "LIBRARY_PATH=$(brew --prefix libomp)/lib" >> $GITHUB_ENV + echo "DYLD_LIBRARY_PATH=$(brew --prefix libomp)/lib" >> $GITHUB_ENV + echo "CMAKE_PREFIX_PATH=$(brew --prefix):$(brew --prefix libomp)" >> $GITHUB_ENV + ver="${{ matrix.os }}"; ver="${ver#macos-}" + echo "MACOSX_DEPLOYMENT_TARGET=${ver}.0" >> "$GITHUB_ENV" + + # Install CUDA on non-macOS runners + - name: Install CUDA Toolkit (non-macOS) + if: runner.os != 'macOS' + uses: Jimver/cuda-toolkit@v0.2.24 + with: + cuda: '12.4.0' + method: 'network' + linux-local-args: '["--toolkit"]' + log-file-suffix: '${{matrix.os}}-cuda-install.txt' + sub-packages: ${{runner.os == 'Linux' && '["nvcc", "cudart"]' || '["nvcc", "cudart", "cufft", "cufft_dev"]'}} + non-cuda-sub-packages: ${{runner.os == 'Linux' && '["libcufft","libcufft-dev"]' || '[]'}} + use-local-cache: 'false' + use-github-cache: 'false' #it is not working at the moment https://github.com/Jimver/cuda-toolkit/issues/390 + + - name: Set up MATLAB + uses: matlab-actions/setup-matlab@v2 + with: + release: R2023a + products: Parallel_Computing_Toolbox + cache: false + + - name: Configure & Build + shell: bash run: | - sudo apt install libfftw3-dev - sed -i '/ LIBSFFT/a \ \ LIBSFFT := `g++ 
--print-file-name libfftw3.a` `g++ --print-file-name libfftw3f.a` `g++ --print-file-name libfftw3_omp.a` `g++ --print-file-name libfftw3f_omp.a` -lm -lgomp' makefile - make matlab - - name: Run Matlab test + # Ensure MATLAB is on PATH if provided by setup-matlab + if [ -n "${MATLAB_ROOT:-}" ]; then export PATH="${MATLAB_ROOT}/bin:${PATH}"; fi + + # Enable CUDA only on Linux + CUDA_FLAG="" + if [ "${RUNNER_OS}" == "Linux" ]; then + CUDA_FLAG="-DFINUFFT_USE_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=50;60;70;80;89" + # Optional: show nvcc version for debugging + if command -v nvcc >/dev/null 2>&1; then nvcc --version || true; fi + fi + + cmake -S . -B build -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DFINUFFT_USE_DUCC0=ON \ + -DFINUFFT_USE_OPENMP=ON \ + -DFINUFFT_MATLAB_INSTALL=ON \ + ${CUDA_FLAG} + + cmake --build build --target finufft_mex --parallel + + if [ "${RUNNER_OS}" == "Linux" ]; then + cmake --build build --target cufinufft_mex --parallel + fi + + - name: Run fullmathtest.m uses: matlab-actions/run-command@v2 with: - command: addpath(genpath('matlab')), check_finufft - - name: Upload mex files - uses: actions/upload-artifact@v4 + command: | + addpath(genpath('matlab')); + addpath(genpath('build')); + fullmathtest + + - name: Package with CPack + shell: bash + run: cmake --build build --target package + + - uses: actions/upload-artifact@v4 with: - name: ${{matrix.os}}-R2023b-finufft-mex - path: ${{runner.workspace}}/finufft/matlab/finufft.mex* - - name: Setup tmate session - if: ${{ failure() }} - uses: mxschmitt/action-tmate@v3 + name: finufft-matlab-mex-${{ matrix.os }} + path: | + build/*.zip + build/*.tar.gz + if-no-files-found: error diff --git a/.github/workflows/powerpc.yml b/.github/workflows/powerpc.yml new file mode 100644 index 000000000..31bf9a74f --- /dev/null +++ b/.github/workflows/powerpc.yml @@ -0,0 +1,78 @@ +name: PowerPC cross-compilation build + +on: + # Weekly schedule (Sunday 00:00 UTC) + schedule: + - cron: "0 0 * * 0" + + # On tag push + push: + tags: + - 'v*' + # Manual trigger (with an optional deploy toggle) + workflow_dispatch: + inputs: + deploy: + description: "Deploy after build?" 
+ type: boolean + required: false + default: false + +jobs: + build: + runs-on: ubuntu-latest + name: '${{ matrix.target.arch }}, ${{ matrix.sys.compiler }} ${{ matrix.sys.version }}' + strategy: + fail-fast: false + matrix: + target: + - { platform: 'ppc64le', arch: 'ppc64le', dir: 'powerpc64le-linux-gnu', flags: '-maltivec -mvsx -mcpu=power10' } + - { platform: 'ppc64', arch: 'ppc64', dir: 'powerpc64-linux-gnu', flags: '-maltivec -mvsx -mcpu=power10' } + sys: + - { compiler: 'gcc', version: '12' } + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Apt setup (cross toolchains, qemu, ninja) + run: | + sudo apt-get update + sudo apt-get --no-install-suggests --no-install-recommends install \ + g++-${{ matrix.sys.version }}-${{ matrix.target.dir }} \ + g++-${{ matrix.sys.version }}-multilib \ + qemu-user \ + ninja-build + sudo update-alternatives --remove-all ${{ matrix.target.dir }}-gcc || true + sudo update-alternatives --remove-all ${{ matrix.target.dir }}-g++ || true + sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-gcc ${{ matrix.target.dir }}-gcc /usr/bin/${{ matrix.target.dir }}-gcc-${{ matrix.sys.version }} 20 + sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-g++ ${{ matrix.target.dir }}-g++ /usr/bin/${{ matrix.target.dir }}-g++-${{ matrix.sys.version }} 20 + + - name: Configure (RelWithDebInfo, DUCC0 + examples + tests, no Fortran) + shell: bash + env: + CC: ${{ matrix.target.dir }}-gcc + CXX: ${{ matrix.target.dir }}-g++ + run: | + cmake -S . -B build -G Ninja \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_C_COMPILER="${CC}" \ + -DCMAKE_CXX_COMPILER="${CXX}" \ + -DCMAKE_C_FLAGS="${{ matrix.target.flags }}" \ + -DCMAKE_CXX_FLAGS="${{ matrix.target.flags }}" \ + -DCMAKE_SYSTEM_NAME=Linux \ + -DCMAKE_SYSTEM_PROCESSOR=${{ matrix.target.arch }} \ + -DCMAKE_CROSSCOMPILING_EMULATOR="qemu-${{ matrix.target.platform }};-cpu;power10;-L;/usr/${{ matrix.target.dir }}/" \ + -DFINUFFT_USE_DUCC0=ON \ + -DFINUFFT_BUILD_EXAMPLES=ON \ + -DFINUFFT_BUILD_TESTS=ON \ + -DFINUFFT_ARCH_FLAGS= + + - name: Build + shell: bash + run: cmake --build build --config RelWithDebInfo --parallel + + - name: Test + shell: bash + # With CMAKE_CROSSCOMPILING_EMULATOR set, tests run via QEMU automatically. + run: ctest --test-dir build --output-on-failure diff --git a/.github/workflows/valgrind.yml b/.github/workflows/valgrind.yml index 2323e9e06..bb385ff48 100644 --- a/.github/workflows/valgrind.yml +++ b/.github/workflows/valgrind.yml @@ -5,8 +5,6 @@ on: [push, pull_request] jobs: valgrind: runs-on: ubuntu-22.04 - strategy: - fail-fast: false steps: - name: Checkout code uses: actions/checkout@v4 @@ -25,7 +23,7 @@ jobs: sudo apt install -y libfftw3-dev jq valgrind - name: Configure Cmake run: | - cmake -S . -B ./build -G Ninja -DCMAKE_BUILD_TYPE:STRING=RelWithDebInfo -DFINUFFT_BUILD_TESTS=ON -DFINUFFT_ENABLE_SANITIZERS=OFF + cmake -S . 
-B ./build -G Ninja -DCMAKE_BUILD_TYPE:STRING=RelWithDebInfo -DFINUFFT_BUILD_TESTS=ON -DFINUFFT_USE_SANITIZERS=OFF - name: Build run: | cmake --build ./build --config RelWithDebInfo diff --git a/CMakeLists.txt b/CMakeLists.txt index b285b1b3c..fb109435f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,27 +1,32 @@ -cmake_minimum_required(VERSION 3.19...3.31) +cmake_minimum_required(VERSION 3.24...3.31) +cmake_policy(VERSION 3.24) project(FINUFFT VERSION 2.5.0 LANGUAGES C CXX) -# windows MSVC runtime flags policy -cmake_policy(SET CMP0091 NEW) - include(CMakeDependentOption) +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release CACHE STRING "Set the default build type to Release" FORCE) +endif() + # gersemi: off # All options go here sphinx tag (don't remove): @cmake_opts_start +option(FINUFFT_BUILD_DEVEL "Whether to build development executables" OFF) +option(FINUFFT_BUILD_DOCS "Whether to build the FINUFFT documentation" OFF) +option(FINUFFT_BUILD_EXAMPLES "Whether to build the FINUFFT examples" OFF) option(FINUFFT_BUILD_FORTRAN "Whether to build the FINUFFT Fortran examples" OFF) option(FINUFFT_BUILD_MATLAB "Whether to build the FINUFFT Matlab interface" OFF) option(FINUFFT_BUILD_PYTHON "Whether the Python wrapper should be built." OFF) -option(FINUFFT_ENABLE_SANITIZERS "Whether to enable sanitizers, only effective for Debug configuration." OFF) -option(FINUFFT_USE_OPENMP "Whether to use OpenMP for parallelization. If disabled, the finufft library will be single threaded. This does not affect the choice of FFTW library." ON) +option(FINUFFT_BUILD_TESTS "Whether to build the FINUFFT tests" OFF) +option(FINUFFT_INTERPROCEDURAL_OPTIMIZATION "Enable interprocedural optimization (LTO) if supported" OFF) +option(FINUFFT_POSITION_INDEPENDENT_CODE "Whether to build the finufft library with position independent code (-fPIC). This forced ON when FINUFFT_SHARED_LINKING is ON." ON) +option(FINUFFT_STATIC_ANALYSIS "Run clang-tidy/cppcheck" OFF) +option(FINUFFT_STATIC_LINKING "If ON builds the static finufft library, if OFF build a shared finufft library." ON) option(FINUFFT_USE_CPU "Whether to build the ordinary FINUFFT library (libfinufft)." ON) option(FINUFFT_USE_CUDA "Whether to build CUDA accelerated FINUFFT library (libcufinufft). This is completely independent of the main FINUFFT library" OFF) -option(FINUFFT_STATIC_LINKING "If ON builds the static finufft library, if OFF build a shared finufft library." ON) -option(FINUFFT_POSITION_INDEPENDENT_CODE "Whether to build the finufft library with position independent code (-fPIC). This forced ON when FINUFFT_SHARED_LINKING is ON." ON) -option(FINUFFT_BUILD_DEVEL "Whether to build development executables" OFF) -option(FINUFFT_BUILD_EXAMPLES "Whether to build the FINUFFT examples" OFF) -option(FINUFFT_BUILD_TESTS "Whether to build the FINUFFT tests" OFF) option(FINUFFT_USE_DUCC0 "Whether to use DUCC0 (instead of FFTW) for CPU FFTs" OFF) -option(FINUFFT_BUILD_DOCS "Whether to build the FINUFFT documentation" OFF) +option(FINUFFT_USE_IWYU "Set CXX_INCLUDE_WHAT_YOU_USE on target (checker-only)" OFF) +option(FINUFFT_USE_OPENMP "Whether to use OpenMP for parallelization. If disabled, the finufft library will be single threaded. This does not affect the choice of FFTW library." ON) +option(FINUFFT_USE_SANITIZERS "Whether to enable sanitizers, only effective for Debug configuration." 
OFF) # if FINUFFT_USE_DUCC0 is ON, the following options are ignored set(FINUFFT_FFTW_LIBRARIES "DEFAULT" CACHE STRING "Specify a custom FFTW library") set(FINUFFT_FFTW_SUFFIX "DEFAULT" CACHE STRING "Suffix for FFTW libraries (e.g. OpenMP, Threads etc.) defaults to empty string if OpenMP is disabled, else uses OpenMP. Ignored if DUCC0 is used.") @@ -31,284 +36,55 @@ set(FINUFFT_ARCH_FLAGS "native" CACHE STRING "Compiler flags for specifying targ cmake_dependent_option(FINUFFT_ENABLE_INSTALL "Disable installation in the case of python builds" ON "NOT FINUFFT_BUILD_PYTHON" OFF) cmake_dependent_option(FINUFFT_STATIC_LINKING "Disable static libraries in the case of python builds" ON "NOT FINUFFT_BUILD_PYTHON" OFF) cmake_dependent_option(FINUFFT_SHARED_LINKING "Shared should be the opposite of static linking" ON "NOT FINUFFT_STATIC_LINKING" OFF) +cmake_dependent_option(FINUFFT_INTERPROCEDURAL_OPTIMIZATION "LTO should be the opposite of static linking" ON "NOT FINUFFT_STATIC_LINKING" OFF) +# internal development options +option(FINUFFT_IWYU_VERBOSE "Verbose IWYU tool output" ON) # gersemi: on +# Use the folder that contains this CMakeLists.txt, not the project root +list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake") + # When building shared libraries, we need to build with -fPIC in all cases if(FINUFFT_SHARED_LINKING) set(FINUFFT_POSITION_INDEPENDENT_CODE ON) endif() -include(cmake/utils.cmake) - -set(FINUFFT_CXX_FLAGS_RELEASE - -funroll-loops - -ffp-contract=fast - -fno-math-errno - -fno-signed-zeros - -fno-trapping-math - -fassociative-math - -freciprocal-math - -fmerge-all-constants - -ftree-vectorize - -fimplicit-constexpr - -fcx-limited-range - -O3 - /Ox - /fp:contract - /fp:except- - /GF - /GY - /GS- - /Ob - /Oi - /Ot - /Oy -) - -filter_supported_compiler_flags(FINUFFT_CXX_FLAGS_RELEASE FINUFFT_CXX_FLAGS_RELEASE) -message(STATUS "FINUFFT Release flags: ${FINUFFT_CXX_FLAGS_RELEASE}") -set(FINUFFT_CXX_FLAGS_RELWITHDEBINFO ${FINUFFT_CXX_FLAGS_RELEASE}) - -set(FINUFFT_CXX_FLAGS_DEBUG - -g - -g3 - -ggdb - -ggdb3 - -Wall - -Wextra - -Wpedantic - -Wno-unknown-pragmas -) - -if(DEFINED ENV{GITHUB_ACTIONS}) - if($ENV{GITHUB_ACTIONS} STREQUAL "true") - message("CMake is being executed inside a GitHub workflow") - # if msvc use FS flag - if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" AND NOT CMAKE_BUILD_TYPE STREQUAL "Release") - set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$:Embedded>") - message("CMAKE_MSVC_DEBUG_INFORMATION_FORMAT TO Embedded") - endif() - endif() -else() - message("CMake is NOT being executed inside a GitHub workflow") - # env variable is: - message(STATUS "ENV{GITHUB_ACTIONS}: $ENV{GITHUB_ACTIONS}") -endif() -filter_supported_compiler_flags(FINUFFT_CXX_FLAGS_DEBUG FINUFFT_CXX_FLAGS_DEBUG) -message(STATUS "FINUFFT Debug flags: ${FINUFFT_CXX_FLAGS_DEBUG}") -list(APPEND FINUFFT_CXX_FLAGS_RELWITHDEBINFO ${FINUFFT_CXX_FLAGS_RELEASE} ${FINUFFT_CXX_FLAGS_DEBUG}) -message(STATUS "FINUFFT RelWithDebInfo flags: ${FINUFFT_CXX_FLAGS_RELWITHDEBINFO}") - -if(FINUFFT_ARCH_FLAGS STREQUAL "native") - set(FINUFFT_ARCH_FLAGS -march=native CACHE STRING "" FORCE) - filter_supported_compiler_flags(FINUFFT_ARCH_FLAGS FINUFFT_ARCH_FLAGS) - if(NOT FINUFFT_ARCH_FLAGS) - set(FINUFFT_ARCH_FLAGS -mtune=native CACHE STRING "" FORCE) - filter_supported_compiler_flags(FINUFFT_ARCH_FLAGS FINUFFT_ARCH_FLAGS) - endif() - if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - # -march=native emulation for MSVC - check_arch_support() - endif() - if(NOT FINUFFT_ARCH_FLAGS) - message(WARNING "No architecture flags are 
supported by the compiler.") - else() - message(STATUS "FINUFFT Arch flags: ${FINUFFT_ARCH_FLAGS}") - endif() -endif() - -# Set default build type to Release -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE Release CACHE STRING "Set the default build type to Release" FORCE) +if(FINUFFT_MATLAB_INSTALL) + set(FINUFFT_BUILD_MATLAB ON) + set(FINUFFT_ENABLE_INSTALL OFF) endif() -# This set of sources is compiled twice, once in single precision and once in -# double precision The single precision compilation is done with -DSINGLE -set(FINUFFT_PRECISION_DEPENDENT_SOURCES) - -# If we're building for Fortran, make sure we also include the translation -# layer. -if(FINUFFT_BUILD_FORTRAN) - list(APPEND FINUFFT_PRECISION_DEPENDENT_SOURCES fortran/finufftfort.cpp) -endif() +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -# set linker flags for sanitizer -set(FINUFFT_SANITIZER_FLAGS) -if(FINUFFT_ENABLE_SANITIZERS) - set(FINUFFT_SANITIZER_FLAGS - -fsanitize=address - -fsanitize=undefined - -fsanitize=bounds-strict - /fsanitize=address - /RTC1 - ) - filter_supported_compiler_flags(FINUFFT_SANITIZER_FLAGS FINUFFT_SANITIZER_FLAGS) - set(FINUFFT_SANITIZER_FLAGS $<$:${FINUFFT_SANITIZER_FLAGS}>) -endif() -# Utility function to enable ASAN on debug builds -function(enable_asan target) - target_compile_options(${target} PRIVATE ${FINUFFT_SANITIZER_FLAGS}) - if(NOT (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")) - target_link_options(${target} PRIVATE ${FINUFFT_SANITIZER_FLAGS}) - endif() -endfunction() +include(utils) # finds cmake/utils.cmake +include(toolchain) # finds cmake/toolchain.cmake -set(CPM_DOWNLOAD_VERSION "0.40.5" CACHE STRING "Version of CPM.cmake to use") +# Versions for dependencies +set(CPM_DOWNLOAD_VERSION "0.42.0" CACHE STRING "Version of CPM.cmake to use") set(FFTW_VERSION "3.3.10" CACHE STRING "Version of FFTW to use") -set(XTL_VERSION "0.7.7" CACHE STRING "Version of xtl to use") set(XSIMD_VERSION "13.2.0" CACHE STRING "Version of xsimd to use") -set(DUCC0_VERSION "ducc0_0_36_0" CACHE STRING "Version of ducc0 to use") +set(DUCC0_VERSION "ducc0_0_38_0" CACHE STRING "Version of ducc0 to use") set(CUDA11_CCCL_VERSION "2.8.5" CACHE STRING "Version of FINUFFT-cccl for cuda 11 to use") -set(CUDA12_CCCL_VERSION "3.0.1" CACHE STRING "Version of FINUFFT-cccl for cuda 12 to use") -mark_as_advanced(CPM_DOWNLOAD_VERSION) -mark_as_advanced(FFTW_VERSION) -mark_as_advanced(XTL_VERSION) -mark_as_advanced(XSIMD_VERSION) -mark_as_advanced(DUCC0_VERSION) -mark_as_advanced(CUDA11_CCCL_VERSION) -mark_as_advanced(CUDA12_CCCL_VERSION) - -include(cmake/setupCPM.cmake) - -if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) - include(CTest) - if(FINUFFT_BUILD_TESTS) - enable_testing() - endif() - if(FINUFFT_BUILD_DOCS) - include(cmake/setupSphinx.cmake) - endif() -endif() - -if(FINUFFT_USE_CPU) - # make apple with gnu use old linker, new linker breaks, see issue #360 - if((APPLE) AND (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")) - add_link_options("-ld_classic") - endif() - set(FINUFFT_FFTW_LIBRARIES) - include(cmake/setupXSIMD.cmake) - if(FINUFFT_USE_DUCC0) - include(cmake/setupDUCC.cmake) - else() - include(cmake/setupFFTW.cmake) - endif() - if(FINUFFT_USE_DUCC0) - set(FINUFFT_FFTLIBS ducc0) - else() - set(FINUFFT_FFTLIBS ${FINUFFT_FFTW_LIBRARIES}) - endif() - if(FINUFFT_USE_OPENMP) - find_package(OpenMP COMPONENTS C CXX REQUIRED) - endif() -endif() -# check if -Wno-deprecated-declarations is supported -check_cxx_compiler_flag(-Wno-deprecated-declarations FINUFFT_HAS_NO_DEPRECATED_DECLARATIONS) - -# Utility function to link static/dynamic lib 
-function(finufft_link_test target) - if(FINUFFT_USE_DUCC0) - target_compile_definitions(${target} PRIVATE FINUFFT_USE_DUCC0) - endif() - target_link_libraries(${target} PRIVATE finufft xsimd ${FINUFFT_FFTLIBS}) - if(FINUFFT_USE_OPENMP) - target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX) - target_link_options(${target} PRIVATE ${OpenMP_CXX_FLAGS}) - endif() - enable_asan(${target}) - target_compile_features(${target} PRIVATE cxx_std_17) - set_target_properties( - ${target} - PROPERTIES - MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>" - POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE} - ) - # disable deprecated warnings for tests if supported - if(FINUFFT_HAS_NO_DEPRECATED_DECLARATIONS) - target_compile_options(${target} PRIVATE -Wno-deprecated-declarations) - endif() -endfunction() +set(CUDA12_CCCL_VERSION "3.0.2" CACHE STRING "Version of FINUFFT-cccl for cuda 12 to use") + +mark_as_advanced( + CPM_DOWNLOAD_VERSION + FFTW_VERSION + XTL_VERSION + XSIMD_VERSION + DUCC0_VERSION + CUDA11_CCCL_VERSION + CUDA12_CCCL_VERSION +) -# Utility function to set finufft compilation options. -function(set_finufft_options target) - target_compile_features(${target} PRIVATE cxx_std_17) - target_compile_options(${target} PRIVATE $<$:${FINUFFT_ARCH_FLAGS}>) - target_compile_options(${target} PRIVATE $<$:${FINUFFT_CXX_FLAGS_RELEASE}>) - target_compile_options(${target} PRIVATE $<$:${FINUFFT_CXX_FLAGS_RELWITHDEBINFO}>) - target_compile_options(${target} PRIVATE $<$:${FINUFFT_CXX_FLAGS_DEBUG}>) - target_include_directories(${target} PUBLIC $) - target_include_directories(${target} SYSTEM INTERFACE $) - set_target_properties( - ${target} - PROPERTIES - MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>" - POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE} - ) - enable_asan(${target}) - if(FINUFFT_USE_OPENMP) - target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX) - target_link_options(${target} PRIVATE ${OpenMP_CXX_FLAGS}) - endif() - if(FINUFFT_USE_DUCC0) - target_compile_definitions(${target} PRIVATE FINUFFT_USE_DUCC0) - endif() - target_link_libraries(${target} PRIVATE xsimd) - target_link_libraries(${target} PRIVATE ${FINUFFT_FFTLIBS}) -endfunction() +include(setupCPM) if(FINUFFT_USE_CPU) - set(FINUFFT_SOURCES - src/spreadinterp.cpp - src/fft.cpp - src/finufft_core.cpp - src/c_interface.cpp - src/finufft_utils.cpp - ) - - if(FINUFFT_BUILD_FORTRAN) - list(APPEND FINUFFT_SOURCES fortran/finufftfort.cpp) - endif() - - # Main finufft libraries - if(NOT FINUFFT_STATIC_LINKING) - add_library(finufft SHARED ${FINUFFT_SOURCES}) - else() - add_library(finufft STATIC ${FINUFFT_SOURCES}) - endif() - set_finufft_options(finufft) - - if(WIN32 AND FINUFFT_SHARED_LINKING) - target_compile_definitions(finufft PRIVATE dll_EXPORTS FINUFFT_DLL) - endif() - find_library(MATH_LIBRARY m) - if(MATH_LIBRARY) - target_link_libraries(finufft PRIVATE ${MATH_LIBRARY}) - endif() - target_include_directories(finufft PUBLIC $) - target_include_directories(finufft SYSTEM INTERFACE $) - if(FINUFFT_ENABLE_INSTALL) - file(GLOB FINUFFT_PUBLIC_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/include/finufft*.h") - set_target_properties(finufft PROPERTIES PUBLIC_HEADER "${FINUFFT_PUBLIC_HEADERS}") - endif() - list(APPEND INSTALL_TARGETS finufft) + add_subdirectory(src) endif() if(FINUFFT_USE_CUDA) - if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - message( - WARNING - "FINUFFT WARNING: No CUDA architecture supplied via '-DCMAKE_CUDA_ARCHITECTURES=...', defaulting to 'native' See: https://developer.nvidia.com/cuda-gpus for more 
details on what architecture to supply." - ) - detect_cuda_architecture() - endif() - if(CMAKE_CUDA_ARCHITECTURES MATCHES "compute_") - message( - FATAL_ERROR - "CMAKE_CUDA_ARCHITECTURES must be a list of integers like 70;75;86, not strings like compute_89" - ) - endif() - enable_language(CUDA) - find_package(CUDAToolkit REQUIRED) - include(cmake/setupCCCL.cmake) + include(cuda_setup) add_subdirectory(src/cuda) if(BUILD_TESTING AND FINUFFT_BUILD_TESTS) add_subdirectory(perftest/cuda) @@ -363,7 +139,7 @@ message(STATUS " FINUFFT_BUILD_TESTS: ${FINUFFT_BUILD_TESTS}") message(STATUS " FINUFFT_BUILD_FORTRAN: ${FINUFFT_BUILD_FORTRAN}") message(STATUS " FINUFFT_BUILD_MATLAB: ${FINUFFT_BUILD_MATLAB}") message(STATUS " FINUFFT_BUILD_PYTHON: ${FINUFFT_BUILD_PYTHON}") -message(STATUS " FINUFFT_ENABLE_SANITIZERS: ${FINUFFT_ENABLE_SANITIZERS}") +message(STATUS " FINUFFT_USE_SANITIZERS: ${FINUFFT_USE_SANITIZERS}") message(STATUS " FINUFFT_FFTW_SUFFIX: ${FINUFFT_FFTW_SUFFIX}") message(STATUS " FINUFFT_FFTW_LIBRARIES: ${FINUFFT_FFTW_LIBRARIES}") message(STATUS " FINUFFT_ARCH_FLAGS: ${FINUFFT_ARCH_FLAGS}") @@ -373,7 +149,23 @@ message(STATUS " CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}") if(FINUFFT_ENABLE_INSTALL) include(GNUInstallDirs) - install(TARGETS ${INSTALL_TARGETS} PUBLIC_HEADER) + include(CMakePackageConfigHelpers) + install(TARGETS ${INSTALL_TARGETS} EXPORT finufftTargets PUBLIC_HEADER) + install(EXPORT finufftTargets NAMESPACE finufft:: DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/finufft) + configure_package_config_file( + cmake/finufftConfig.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/finufftConfig.cmake + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/finufft + ) + write_basic_package_version_file( + ${CMAKE_CURRENT_BINARY_DIR}/finufftConfigVersion.cmake + VERSION ${PROJECT_VERSION} + COMPATIBILITY SameMajorVersion + ) + install( + FILES ${CMAKE_CURRENT_BINARY_DIR}/finufftConfig.cmake ${CMAKE_CURRENT_BINARY_DIR}/finufftConfigVersion.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/finufft + ) install(FILES ${PROJECT_SOURCE_DIR}/LICENSE DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/licenses/finufft) if(FINUFFT_USE_CPU) install( diff --git a/cmake/cuda_setup.cmake b/cmake/cuda_setup.cmake new file mode 100644 index 000000000..44c834554 --- /dev/null +++ b/cmake/cuda_setup.cmake @@ -0,0 +1,55 @@ +# cmake/cuda_setup + +include_guard(GLOBAL) + +function(detect_cuda_architecture) + find_program(NVIDIA_SMI_EXECUTABLE nvidia-smi) + if(NVIDIA_SMI_EXECUTABLE) + execute_process( + COMMAND ${NVIDIA_SMI_EXECUTABLE} --query-gpu=compute_cap --format=csv,noheader + OUTPUT_VARIABLE compute_caps_raw + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + ) + if(compute_caps_raw) + string(REPLACE "\n" ";" compute_caps "${compute_caps_raw}") + set(max_arch_num 0) + foreach(cc ${compute_caps}) + string(STRIP "${cc}" cc_s) + if(cc_s MATCHES "^[0-9]+\\.[0-9]+$") + string(REPLACE "." "" arch_digit "${cc_s}") # 8.9 -> 89 + if(arch_digit GREATER max_arch_num) + set(max_arch_num ${arch_digit}) + endif() + endif() + endforeach() + if(max_arch_num GREATER 0) + message(STATUS "Detected CUDA compute capability: sm_${max_arch_num}") + # Write to CACHE so it "sticks" for this configure and future runs + set(CMAKE_CUDA_ARCHITECTURES "${max_arch_num}" CACHE STRING "CUDA SMs" FORCE) + return() + endif() + endif() + message( + WARNING + "Could not parse compute capability from nvidia-smi output '${compute_caps_raw}'. Using 'native'." 
+ ) + set(CMAKE_CUDA_ARCHITECTURES "native" CACHE STRING "CUDA SMs" FORCE) + else() + message(WARNING "nvidia-smi not found. Using 'native'.") + set(CMAKE_CUDA_ARCHITECTURES "native" CACHE STRING "CUDA SMs" FORCE) + endif() +endfunction() + +# Only detect if user didn't supply it on the command line/preset +if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "") + detect_cuda_architecture() +else() + message(STATUS "Using user-specified CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}") +endif() + +# Now it's safe to enable CUDA / find the toolkit +enable_language(CUDA) +find_package(CUDAToolkit REQUIRED) + +include(setupCCCL) diff --git a/cmake/finufftConfig.cmake.in b/cmake/finufftConfig.cmake.in new file mode 100644 index 000000000..6de5e682c --- /dev/null +++ b/cmake/finufftConfig.cmake.in @@ -0,0 +1,5 @@ +@PACKAGE_INIT@ + +include("${CMAKE_CURRENT_LIST_DIR}/finufftTargets.cmake") + +check_required_components(finufft) diff --git a/cmake/setupDUCC.cmake b/cmake/setupDUCC.cmake index 89c3a093c..2057db580 100644 --- a/cmake/setupDUCC.cmake +++ b/cmake/setupDUCC.cmake @@ -25,12 +25,7 @@ if(ducc0_ADDED) target_compile_options(ducc0 PRIVATE $<$:${FINUFFT_CXX_FLAGS_RELWITHDEBINFO}>) target_compile_features(ducc0 PRIVATE cxx_std_17) # private because we do not want to propagate this requirement - set_target_properties( - ducc0 - PROPERTIES - MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>" - POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE} - ) + set_target_properties(ducc0 PROPERTIES POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE}) check_cxx_compiler_flag(-ffast-math HAS_FAST_MATH) if(HAS_FAST_MATH) target_compile_options(ducc0 PRIVATE -ffast-math) diff --git a/cmake/setupFFTW.cmake b/cmake/setupFFTW.cmake index e6eb36619..8fe95e8ae 100644 --- a/cmake/setupFFTW.cmake +++ b/cmake/setupFFTW.cmake @@ -12,6 +12,7 @@ CPMAddPackage( ) list(APPEND CMAKE_MODULE_PATH "${findfftw_SOURCE_DIR}") +set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" PARENT_SCOPE) if(FINUFFT_FFTW_LIBRARIES STREQUAL DEFAULT OR FINUFFT_FFTW_LIBRARIES STREQUAL DOWNLOAD) find_package(FFTW) @@ -24,6 +25,10 @@ if(FINUFFT_FFTW_LIBRARIES STREQUAL DEFAULT OR FINUFFT_FFTW_LIBRARIES STREQUAL DO CPMAddPackage( NAME fftw3 + URL + "http://www.fftw.org/fftw-${FFTW_VERSION}.tar.gz" + URL_HASH + "MD5=8ccbf6a5ea78a16dbc3e1306e234cc5c" OPTIONS "ENABLE_SSE2 ON" "ENABLE_AVX ON" @@ -32,19 +37,16 @@ if(FINUFFT_FFTW_LIBRARIES STREQUAL DEFAULT OR FINUFFT_FFTW_LIBRARIES STREQUAL DO "BUILD_SHARED_LIBS OFF" "ENABLE_THREADS ${FINUFFT_USE_THREADS}" "ENABLE_OPENMP ${FINUFFT_USE_OPENMP}" - URL - "http://www.fftw.org/fftw-${FFTW_VERSION}.tar.gz" - URL_HASH - "MD5=8ccbf6a5ea78a16dbc3e1306e234cc5c" - EXCLUDE_FROM_ALL - YES - GIT_SHALLOW - YES + "CMAKE_POLICY_VERSION_MINIMUM 3.10" ) CPMAddPackage( NAME fftw3f + URL + "http://www.fftw.org/fftw-${FFTW_VERSION}.tar.gz" + URL_HASH + "MD5=8ccbf6a5ea78a16dbc3e1306e234cc5c" OPTIONS "ENABLE_SSE2 ON" "ENABLE_AVX ON" @@ -54,14 +56,7 @@ if(FINUFFT_FFTW_LIBRARIES STREQUAL DEFAULT OR FINUFFT_FFTW_LIBRARIES STREQUAL DO "BUILD_SHARED_LIBS OFF" "ENABLE_THREADS ${FINUFFT_USE_THREADS}" "ENABLE_OPENMP ${FINUFFT_USE_OPENMP}" - URL - "http://www.fftw.org/fftw-${FFTW_VERSION}.tar.gz" - URL_HASH - "MD5=8ccbf6a5ea78a16dbc3e1306e234cc5c" - EXCLUDE_FROM_ALL - YES - GIT_SHALLOW - YES + "CMAKE_POLICY_VERSION_MINIMUM 3.10" ) set(FINUFFT_FFTW_LIBRARIES fftw3 fftw3f) if(FINUFFT_USE_THREADS) @@ -74,8 +69,8 @@ if(FINUFFT_FFTW_LIBRARIES STREQUAL DEFAULT OR FINUFFT_FFTW_LIBRARIES STREQUAL DO 
set_target_properties( ${element} PROPERTIES - MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>" POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE} + CMAKE_MSVC_DEBUG_INFORMATION_FORMAT Embedded ) endforeach() diff --git a/cmake/setupIWYU.cmake b/cmake/setupIWYU.cmake new file mode 100644 index 000000000..6f5c9de3b --- /dev/null +++ b/cmake/setupIWYU.cmake @@ -0,0 +1,391 @@ +# IWYU integration helpers +# - Adds a custom target `iwyu-` that: +# 1) Runs iwyu_tool.py over 's sources using compile_commands.json +# 2) Optionally pipes into fix_includes.py to auto-apply fixes +# 3) Applies fixes ONLY under src/, include/, test/ (default) +# +# Extras: +# - IWYU_EXCLUDE_FILES: skip certain files entirely (analysis + auto-fix) +# (Default excludes: src/fft.cpp and include/finufft/fft.h) +# - IWYU_MAPPING_FILES: semicolon-separated .imp files (auto-added if they exist) +# +# Usage in CMakeLists.txt: +# add_iwyu_fix_target(my_target) + +# ----------------------------------------------------------------------------- # +# User-tunable cache variables +# ----------------------------------------------------------------------------- # +set(IWYU_ONLY_REGEX + "^${PROJECT_SOURCE_DIR}/(src|include|test|examples)/.*" + CACHE STRING + "Regex of files to APPLY FIXES to (fix_includes.py --only_re)." +) + +set(IWYU_IGNORE_REGEX + "(external/|third_party/|3rdparty/|vendor/|_deps/|build/|cmake%-build)" + CACHE STRING + "Regex of paths to ignore when applying fixes (fix_includes.py --ignore_re)." +) + +set(IWYU_EXTRA_ARGS "" CACHE STRING "Extra arguments forwarded to include-what-you-use (space-separated).") + +# Files to exclude entirely from IWYU analysis AND auto-fix (semicolon-separated). +# Paths may be absolute or relative to ${PROJECT_SOURCE_DIR}. +set(IWYU_EXCLUDE_FILES + "src/fft.cpp;include/finufft/fft.h;src/finufft_utils.cpp" + CACHE STRING + "Files to exclude from IWYU analysis and auto-fix (semicolon-separated)." +) + +# Optional: semicolon-separated list of .imp mapping files to load. +# Example: "${PROJECT_SOURCE_DIR}/iwyu.imp" +set(IWYU_MAPPING_FILES + "${PROJECT_SOURCE_DIR}/iwyu.imp" + CACHE STRING + "Semicolon-separated IWYU mapping files (.imp). Non-existent paths are ignored." 
+) + +mark_as_advanced( + IWYU_ONLY_REGEX + IWYU_IGNORE_REGEX + IWYU_EXTRA_ARGS + IWYU_EXCLUDE_FILES + IWYU_MAPPING_FILES + IWYU_BIN + IWYU_TOOL + IWYU_FIX +) + +# ----------------------------------------------------------------------------- # +# Find tools (soft — never fail the configure step) +# ----------------------------------------------------------------------------- # +find_program(IWYU_BIN NAMES include-what-you-use iwyu) +find_program(IWYU_TOOL NAMES iwyu_tool.py iwyu_tool) +find_program(IWYU_FIX NAMES fix_includes.py fix_includes) +find_package(Python3 COMPONENTS Interpreter) # no REQUIRED, soft fail with message + +# If scripts not found, try beside IWYU_BIN +if(IWYU_BIN AND (NOT IWYU_TOOL OR NOT IWYU_FIX)) + get_filename_component(_IWYU_DIR "${IWYU_BIN}" DIRECTORY) + if(NOT IWYU_TOOL AND EXISTS "${_IWYU_DIR}/iwyu_tool.py") + set(IWYU_TOOL "${_IWYU_DIR}/iwyu_tool.py") + endif() + if(NOT IWYU_FIX AND EXISTS "${_IWYU_DIR}/fix_includes.py") + set(IWYU_FIX "${_IWYU_DIR}/fix_includes.py") + endif() +endif() + +# ----------------------------------------------------------------------------- # +# Helpers +# ----------------------------------------------------------------------------- # +# Collect C/C++ sources for a target (skip headers and non-C/C++) +function(_iwyu_collect_sources tgt out_var) + get_target_property(_srcs ${tgt} SOURCES) + set(_files "") + foreach(s IN LISTS _srcs) + get_source_file_property(_lang "${s}" LANGUAGE) + if(_lang STREQUAL "C" OR _lang STREQUAL "CXX") + list(APPEND _files "${s}") + elseif(NOT _lang OR _lang STREQUAL "NONE") + if(s MATCHES "\\.(c|cc|cp|cxx|cpp|c\\+\\+|C)$") + list(APPEND _files "${s}") + endif() + endif() + endforeach() + set(${out_var} "${_files}" PARENT_SCOPE) +endfunction() + +# Detect whether fix_includes.py supports --quiet; sets IWYU_FIX_QUIET_ARG. +function(_iwyu_detect_fix_quiet) + set(IWYU_FIX_QUIET_ARG "" PARENT_SCOPE) + if(NOT IWYU_FIX OR NOT Python3_Interpreter_FOUND OR NOT Python3_EXECUTABLE) + return() + endif() + execute_process( + COMMAND "${Python3_EXECUTABLE}" "${IWYU_FIX}" -h + OUTPUT_VARIABLE _fix_help_out + ERROR_VARIABLE _fix_help_err + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_STRIP_TRAILING_WHITESPACE + ) + set(_help_text "${_fix_help_out}\n${_fix_help_err}") + if(_help_text MATCHES "--quiet") + set(IWYU_FIX_QUIET_ARG "--quiet" PARENT_SCOPE) + endif() +endfunction() + +# Strong health check: try an actual (headerless) compile to trigger ABI issues. +function(_iwyu_check_binary out_ok) + set(${out_ok} FALSE PARENT_SCOPE) + if(NOT IWYU_BIN) + return() + endif() + + execute_process(COMMAND "${IWYU_BIN}" --help RESULT_VARIABLE _rc_help OUTPUT_QUIET ERROR_QUIET) + + # Probe compile: C++ file with no headers, syntax-only. + set(_probe_dir "${CMAKE_BINARY_DIR}/_iwyu") + file(MAKE_DIRECTORY "${_probe_dir}") + set(_probe_src "${_probe_dir}/__iwyu_probe__.cc") + file(WRITE "${_probe_src}" "int main(){return 0;}") + + execute_process( + COMMAND "${IWYU_BIN}" -fsyntax-only -x c++ "${_probe_src}" + RESULT_VARIABLE _rc_probe + OUTPUT_VARIABLE _o + ERROR_VARIABLE _e + ) + + if(_rc_probe EQUAL 0) + set(${out_ok} TRUE PARENT_SCOPE) + else() + string(REPLACE "\n" "\n " _e_indented "${_e}") + message(STATUS "[IWYU] '${IWYU_BIN}' failed probe compile (rc=${_rc_probe}).\n ${_e_indented}") + set(${out_ok} FALSE PARENT_SCOPE) + endif() +endfunction() + +# Create a shim dir that provides 'include-what-you-use' which execs IWYU_BIN. +# iwyu_tool.py looks up 'include-what-you-use' on PATH; this forces it to use ours. 
+function(_iwyu_prepare_shim out_dir) + set(_shim_dir "${CMAKE_BINARY_DIR}/_iwyu/shim") + if(UNIX AND IWYU_BIN) + file(MAKE_DIRECTORY "${_shim_dir}") + set(_shim "${_shim_dir}/include-what-you-use") + if(NOT EXISTS "${_shim}") + file(WRITE "${_shim}" "#!/bin/sh\nexec \"${IWYU_BIN}\" \"$@\"\n") + execute_process(COMMAND /bin/chmod +x "${_shim}") + endif() + set(${out_dir} "${_shim_dir}" PARENT_SCOPE) + else() + if(IWYU_BIN) + get_filename_component(_d "${IWYU_BIN}" DIRECTORY) + set(${out_dir} "${_d}" PARENT_SCOPE) + else() + set(${out_dir} "" PARENT_SCOPE) + endif() + endif() +endfunction() + +# ----------------------------------------------------------------------------- # +# Public: add_iwyu_fix_target() +# ----------------------------------------------------------------------------- # +function(add_iwyu_fix_target tgt) + if(NOT TARGET ${tgt}) + message(STATUS "[IWYU] IWYU not enabled: target '${tgt}' not found") + return() + endif() + + get_target_property(_type ${tgt} TYPE) + if(_type STREQUAL "INTERFACE_LIBRARY") + message(STATUS "[IWYU] Skipping INTERFACE target '${tgt}'") + return() + endif() + + # Ensure compile_commands.json exists (iwyu_tool.py requires it) + set(_CC_JSON "${CMAKE_BINARY_DIR}/compile_commands.json") + if(NOT EXISTS "${_CC_JSON}") + message( + STATUS + "[IWYU] IWYU not enabled: compile_commands.json not found (enable with -DCMAKE_EXPORT_COMPILE_COMMANDS=ON)" + ) + return() + endif() + + # Gather and filter sources + _iwyu_collect_sources(${tgt} _SRC_FILES) + + # Normalize IWYU_EXCLUDE_FILES to absolute paths + set(_EXCL_ABS "") + if(IWYU_EXCLUDE_FILES) + foreach(ex IN LISTS IWYU_EXCLUDE_FILES) + if(IS_ABSOLUTE "${ex}") + list(APPEND _EXCL_ABS "${ex}") + else() + list(APPEND _EXCL_ABS "${PROJECT_SOURCE_DIR}/${ex}") + endif() + endforeach() + endif() + + # Remove excluded files from analysis + if(_EXCL_ABS) + set(_SRC_FILES_ABS "") + foreach(f IN LISTS _SRC_FILES) + if(IS_ABSOLUTE "${f}") + list(APPEND _SRC_FILES_ABS "${f}") + else() + list(APPEND _SRC_FILES_ABS "${CMAKE_CURRENT_SOURCE_DIR}/${f}") + endif() + endforeach() + foreach(exabs IN LISTS _EXCL_ABS) + list(FILTER _SRC_FILES_ABS EXCLUDE REGEX "^${exabs}$") + endforeach() + set(_SRC_FILES "${_SRC_FILES_ABS}") + endif() + + if(_SRC_FILES STREQUAL "") + message(STATUS "[IWYU] No C/C++ sources for '${tgt}', skipping") + return() + endif() + + # Tools availability and health (degrade gracefully, never fail) + # We need Python for iwyu_tool.py and fix_includes.py. + if(NOT Python3_Interpreter_FOUND OR NOT Python3_EXECUTABLE) + message(STATUS "[IWYU] IWYU not enabled: Python3 interpreter not found") + return() + endif() + + if(NOT IWYU_BIN) + message(STATUS "[IWYU] IWYU not enabled: include-what-you-use binary not found") + return() + endif() + + if(NOT IWYU_TOOL) + message(STATUS "[IWYU] IWYU not enabled: iwyu_tool.py not found") + return() + endif() + + # Health check the IWYU binary + _iwyu_check_binary(_ok) + if(NOT _ok) + message(STATUS "[IWYU] IWYU not enabled: probe compile failed") + return() + endif() + + # We can still run analysis if fix_includes.py is missing. 
+ set(_have_fix TRUE) + if(NOT IWYU_FIX) + set(_have_fix FALSE) + message(STATUS "[IWYU] fix_includes.py not found: will run analysis only (no auto-fix)") + endif() + + # Normalize to absolute paths for iwyu_tool.py + set(_SRC_ARGS "") + foreach(f IN LISTS _SRC_FILES) + if(IS_ABSOLUTE "${f}") + list(APPEND _SRC_ARGS "${f}") + else() + list(APPEND _SRC_ARGS "${CMAKE_CURRENT_SOURCE_DIR}/${f}") + endif() + endforeach() + + # Compose extra args (local copy so we can safely append) + set(_EXTRA_LOCAL "${IWYU_EXTRA_ARGS}") + + # Add any mapping files that exist + if(IWYU_MAPPING_FILES) + foreach(_mf IN LISTS IWYU_MAPPING_FILES) + if(NOT IS_ABSOLUTE "${_mf}") + set(_mf "${PROJECT_SOURCE_DIR}/${_mf}") + endif() + if(EXISTS "${_mf}") + # Use -Xiwyu so flags reach IWYU proper + set(_EXTRA_LOCAL "${_EXTRA_LOCAL} -Xiwyu --mapping_file=${_mf}") + else() + message(STATUS "[IWYU] Mapping file not found (skipped): ${_mf}") + endif() + endforeach() + endif() + + # Verbosity & split + set(_VERBOSE "") + if(FINUFFT_IWYU_VERBOSE) + set(_VERBOSE "-v") + endif() + separate_arguments(_EXTRA NATIVE_COMMAND "${_EXTRA_LOCAL}") + if(_EXTRA) + set(_EXTRA_PART " -- $") + else() + set(_EXTRA_PART "") + endif() + + _iwyu_detect_fix_quiet() + set(_FIX_QUIET_ARG "${IWYU_FIX_QUIET_ARG}") + + # Logging + set(_LOG_DIR "${CMAKE_BINARY_DIR}/_iwyu") + file(MAKE_DIRECTORY "${_LOG_DIR}") + set(_LOG_FILE "${_LOG_DIR}/iwyu-${tgt}.out") + + # Ensure iwyu_tool.py resolves the correct IWYU via PATH + _iwyu_prepare_shim(_PATH_FRONT) + + # Base command (force IWYU output format!) + set(_CMD1 + "${Python3_EXECUTABLE} \"${IWYU_TOOL}\" -p \"${CMAKE_BINARY_DIR}\" -o iwyu $ ${_VERBOSE}${_EXTRA_PART}" + ) + + # Build an ignore regex that also blocks the excluded files from auto-fix + set(_IWYU_IGNORE_RX "${IWYU_IGNORE_REGEX}") + if(_EXCL_ABS) + foreach(ex IN LISTS _EXCL_ABS) + string(REPLACE "." "\\." _ex_esc "${ex}") # escape '.' 
+ set(_IWYU_IGNORE_RX "${_IWYU_IGNORE_RX}|^${_ex_esc}$") + endforeach() + endif() + + if(WIN32) + # Escape % for cmd.exe + set(_IGNORE_ESC "${_IWYU_IGNORE_RX}") + set(_ONLY_ESC "${IWYU_ONLY_REGEX}") + string(REPLACE "%" "%%" _IGNORE_ESC "${_IGNORE_ESC}") + string(REPLACE "%" "%%" _ONLY_ESC "${_ONLY_ESC}") + + if(_have_fix) + set(_PIPE_CMD + "${_CMD1} 2>&1 > \"${_LOG_FILE}\" & type \"${_LOG_FILE}\" & (findstr /r \"should .* these lines\" \"${_LOG_FILE}\" >NUL) && (${Python3_EXECUTABLE} \"${IWYU_FIX}\" --nocomments ${_FIX_QUIET_ARG} --ignore_re=${_IGNORE_ESC} --only_re=${_ONLY_ESC} < \"${_LOG_FILE}\") || echo [IWYU] No actionable suggestions (see ${_LOG_FILE})" + ) + else() + set(_PIPE_CMD "${_CMD1} 2>&1 > \"${_LOG_FILE}\" & type \"${_LOG_FILE}\"") + endif() + set(_SHELL cmd /C) + set(_ENV_PATH "PATH=${_PATH_FRONT};$ENV{PATH}") + else() + if(_have_fix) + set(_PIPE_CMD + "${_CMD1} 2>&1 | tee \"${_LOG_FILE}\"; if grep -E -q \"should (add|remove) these lines\" \"${_LOG_FILE}\"; then ${Python3_EXECUTABLE} \"${IWYU_FIX}\" --nocomments ${_FIX_QUIET_ARG} --ignore_re='${_IWYU_IGNORE_RX}' --only_re='${IWYU_ONLY_REGEX}' < \"${_LOG_FILE}\"; else echo \"[IWYU] No actionable suggestions (see ${_LOG_FILE})\"; fi" + ) + else() + set(_PIPE_CMD "${_CMD1} 2>&1 | tee \"${_LOG_FILE}\"") + endif() + set(_SHELL /bin/sh -lc) + if(_PATH_FRONT) + set(_ENV_PATH "PATH=${_PATH_FRONT}:$ENV{PATH}") + else() + set(_ENV_PATH "PATH=$ENV{PATH}") + endif() + endif() + + # Comment + set(_COMMENT "[IWYU] include-what-you-use") + if(_have_fix) + set(_COMMENT "${_COMMENT} + auto-fix") + endif() + set(_COMMENT "${_COMMENT} (only src/include/test) for '${tgt}'") + + add_custom_target( + iwyu-${tgt} + COMMAND ${CMAKE_COMMAND} -E env "${_ENV_PATH}" ${_SHELL} "${_PIPE_CMD}" + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + USES_TERMINAL + VERBATIM + COMMENT "${_COMMENT}" + ) + + if(FINUFFT_ENABLE_IWYU AND IWYU_BIN) + set_property(TARGET ${tgt} PROPERTY CXX_INCLUDE_WHAT_YOU_USE "${IWYU_BIN};${IWYU_EXTRA_ARGS}") + endif() + + message(STATUS "[IWYU] Added target: iwyu-${tgt}") +endfunction() + +mark_as_advanced( + IWYU_ONLY_REGEX + IWYU_IGNORE_REGEX + IWYU_EXTRA_ARGS + IWYU_EXCLUDE_FILES + IWYU_MAPPING_FILES + IWYU_BIN + IWYU_TOOL + IWYU_FIX +) diff --git a/cmake/setupXSIMD.cmake b/cmake/setupXSIMD.cmake index 475412514..779d87575 100644 --- a/cmake/setupXSIMD.cmake +++ b/cmake/setupXSIMD.cmake @@ -1,18 +1,3 @@ -CPMAddPackage( - NAME - xtl - GIT_REPOSITORY - "https://github.com/xtensor-stack/xtl.git" - GIT_TAG - ${XTL_VERSION} - EXCLUDE_FROM_ALL - YES - GIT_SHALLOW - YES - OPTIONS - "XTL_DISABLE_EXCEPTIONS YES" -) - CPMAddPackage( NAME xsimd @@ -26,5 +11,4 @@ CPMAddPackage( YES OPTIONS "XSIMD_SKIP_INSTALL YES" - "XSIMD_ENABLE_XTL_COMPLEX YES" ) diff --git a/cmake/staticAnalysis.cmake b/cmake/staticAnalysis.cmake new file mode 100644 index 000000000..cc404d37c --- /dev/null +++ b/cmake/staticAnalysis.cmake @@ -0,0 +1,117 @@ +# ---- Optional: tweak defaults without editing code (cache variables) ------- +set(ANALYSIS_HEADER_FILTER + "^${PROJECT_SOURCE_DIR}/(src|include)/.*" + CACHE STRING + "Regex for clang-tidy -header-filter (project files)" +) +set(ANALYSIS_CPPCHECK_SUPPRESS_DIRS + "${PROJECT_SOURCE_DIR}/external;${PROJECT_SOURCE_DIR}/third_party" + CACHE STRING + "Semicolon-separated dirs to suppress in cppcheck" +) + +# ---- Find newest clang-tidy ------------------------------------------------ +set(_CLANG_TIDY_CANDIDATES) +foreach(ver RANGE 22 10 -1) + list(APPEND _CLANG_TIDY_CANDIDATES clang-tidy-${ver}) +endforeach() +list(APPEND 
_CLANG_TIDY_CANDIDATES clang-tidy) +find_program(CLANG_TIDY_EXE NAMES ${_CLANG_TIDY_CANDIDATES}) + +# ---- Find cppcheck (unversioned only) -------------------------------------- +find_program(CPPCHECK_EXE NAMES cppcheck) + +# ---- Internal helper: detect if target has any C++ sources ----------------- +function(_target_has_cxx out_var tgt) + get_target_property(_srcs ${tgt} SOURCES) + set(_has FALSE) + foreach(s IN LISTS _srcs) + get_source_file_property(_lang "${s}" LANGUAGE) + if(_lang STREQUAL "CXX") + set(_has TRUE) + break() + endif() + if(NOT _lang OR _lang STREQUAL "NONE") + if(s MATCHES "\\.(cc|cpp|cxx|C)($|\\.)") + set(_has TRUE) + break() + endif() + endif() + endforeach() + set(${out_var} ${_has} PARENT_SCOPE) +endfunction() + +# ---- Internal helper: compute C++ standard for target ---------------------- +function(_detect_cxx_standard out_var tgt) + get_target_property(_std ${tgt} CXX_STANDARD) + if(NOT _std) + if(DEFINED CMAKE_CXX_STANDARD) + set(_std ${CMAKE_CXX_STANDARD}) + else() + set(_std 17) + endif() + endif() + set(${out_var} ${_std} PARENT_SCOPE) +endfunction() + +# ---- Public: enable analysis for a single target --------------------------- +function(enable_static_analysis tgt) + if(NOT FINUFFT_STATIC_ANALYSIS) + return() + endif() + + if(NOT TARGET ${tgt}) + message(FATAL_ERROR "enable_static_analysis: target '${tgt}' not found") + endif() + + # Skip header-only libs + get_target_property(_type ${tgt} TYPE) + if(_type STREQUAL "INTERFACE_LIBRARY") + return() + endif() + + # Skip targets without C++ sources + _target_has_cxx(_has ${tgt}) + if(NOT _has) + return() + endif() + + # Determine target C++ standard + _detect_cxx_standard(_std ${tgt}) + + # ----- clang-tidy per-target ------------------------------------------- + if(CLANG_TIDY_EXE) + set(_TIDY_ARGS -checks=* -header-filter=${ANALYSIS_HEADER_FILTER} --extra-arg=-std=c++${_std}) + set_property(TARGET ${tgt} PROPERTY CXX_CLANG_TIDY "${CLANG_TIDY_EXE};${_TIDY_ARGS}") + endif() + + # ----- cppcheck per-target --------------------------------------------- + if(CPPCHECK_EXE) + set(_CPPCHECK_ARGS + --enable=warning,style,performance,portability + --inconclusive + --inline-suppr + --suppress=missingIncludeSystem + --std=c++${_std} + ) + + foreach(d IN LISTS ANALYSIS_CPPCHECK_SUPPRESS_DIRS) + if(EXISTS "${d}") + list(APPEND _CPPCHECK_ARGS "--suppress=*:${d}/*") + endif() + endforeach() + + set_property(TARGET ${tgt} PROPERTY CXX_CPPCHECK "${CPPCHECK_EXE};${_CPPCHECK_ARGS}") + endif() +endfunction() + +# ---- Public: enable analysis for all targets in this directory ------------- +function(FINUFFT_STATIC_ANALYSIS_here) + if(NOT FINUFFT_STATIC_ANALYSIS) + return() + endif() + get_property(_targets DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY BUILDSYSTEM_TARGETS) + foreach(t IN LISTS _targets) + enable_static_analysis(${t}) + endforeach() +endfunction() diff --git a/cmake/toolchain.cmake b/cmake/toolchain.cmake new file mode 100644 index 000000000..c58cb33d9 --- /dev/null +++ b/cmake/toolchain.cmake @@ -0,0 +1,114 @@ +# cmake/toolchain.cmake +include_guard(GLOBAL) + +# Assumes cmake/utils.cmake has already been included by the top-level CMakeLists +# for: filter_supported_compiler_flags(), check_arch_support(), detect_cuda_architecture() + +# ---- Install targets container ------------------------------------------------ +# Keep this accessible globally +set(INSTALL_TARGETS "" CACHE INTERNAL "FINUFFT install targets list") + +# ---- C++ flags (Release / Debug / RelWithDebInfo)
---------------------------- +set(FINUFFT_CXX_FLAGS_RELEASE + -funroll-loops + -ffp-contract=fast + -fno-math-errno + -fno-signed-zeros + -fno-trapping-math + -fassociative-math + -freciprocal-math + -fmerge-all-constants + -ftree-vectorize + -fimplicit-constexpr + -fcx-limited-range + -O3 + /Ox + /fp:contract + /fp:except- + /GF + /GY + /GS- + /Ob + /Oi + /Ot + /Oy +) +filter_supported_compiler_flags(FINUFFT_CXX_FLAGS_RELEASE FINUFFT_CXX_FLAGS_RELEASE) +message(STATUS "FINUFFT Release flags: ${FINUFFT_CXX_FLAGS_RELEASE}") +set(FINUFFT_CXX_FLAGS_RELWITHDEBINFO ${FINUFFT_CXX_FLAGS_RELEASE}) + +set(FINUFFT_CXX_FLAGS_DEBUG + -g + -g3 + -ggdb + -ggdb3 + -Wall + -Wextra + -Wpedantic + -Wno-unknown-pragmas + /W4 + /permissive- + /wd4068 +) +filter_supported_compiler_flags(FINUFFT_CXX_FLAGS_DEBUG FINUFFT_CXX_FLAGS_DEBUG) +message(STATUS "FINUFFT Debug flags: ${FINUFFT_CXX_FLAGS_DEBUG}") + +list(APPEND FINUFFT_CXX_FLAGS_RELWITHDEBINFO ${FINUFFT_CXX_FLAGS_RELEASE} ${FINUFFT_CXX_FLAGS_DEBUG}) +message(STATUS "FINUFFT RelWithDebInfo flags: ${FINUFFT_CXX_FLAGS_RELWITHDEBINFO}") + +# ---- Architecture flags ------------------------------------------------------- +if(FINUFFT_ARCH_FLAGS STREQUAL "native") + set(FINUFFT_ARCH_FLAGS -march=native CACHE STRING "" FORCE) + filter_supported_compiler_flags(FINUFFT_ARCH_FLAGS FINUFFT_ARCH_FLAGS) + if(NOT FINUFFT_ARCH_FLAGS) + set(FINUFFT_ARCH_FLAGS -mtune=native CACHE STRING "" FORCE) + filter_supported_compiler_flags(FINUFFT_ARCH_FLAGS FINUFFT_ARCH_FLAGS) + endif() + if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + # -march=native emulation for MSVC + check_arch_support() + endif() + if(NOT FINUFFT_ARCH_FLAGS) + message(WARNING "No architecture flags are supported by the compiler.") + else() + message(STATUS "FINUFFT Arch flags: ${FINUFFT_ARCH_FLAGS}") + endif() +endif() + +# ---- Default build type ------------------------------------------------------- +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release CACHE STRING "Set the default build type to Release" FORCE) +endif() + +# ---- Precision-dependent sources --------------------------------------------- +set(FINUFFT_PRECISION_DEPENDENT_SOURCES) + +# Fortran translation layer when enabled +if(FINUFFT_BUILD_FORTRAN) + list(APPEND FINUFFT_PRECISION_DEPENDENT_SOURCES fortran/finufftfort.cpp) +endif() + +# ---- Sanitizers --------------------------------------------------------------- +set(FINUFFT_SANITIZER_FLAGS) +if(FINUFFT_USE_SANITIZERS) + set(FINUFFT_SANITIZER_FLAGS + -fsanitize=address + -fsanitize=undefined + -fsanitize=bounds-strict + /fsanitize=address + /RTC1 + ) + filter_supported_compiler_flags(FINUFFT_SANITIZER_FLAGS FINUFFT_SANITIZER_FLAGS) + set(FINUFFT_SANITIZER_FLAGS $<$:${FINUFFT_SANITIZER_FLAGS}>) +endif() + +# ---- Top-project features (CTest, Sphinx) ------------------------------------ +if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) + include(CTest) + if(FINUFFT_BUILD_TESTS) + enable_testing() + endif() + if(FINUFFT_BUILD_DOCS) + include(setupSphinx) + endif() +endif() diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 6eca0a55c..eecdd7edb 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -76,40 +76,142 @@ function(copy_dll source_target destination_target) unset(DESTINATION_FILE) endfunction() -include(CheckIPOSupported) -check_ipo_supported(RESULT LTO_SUPPORTED OUTPUT LTO_ERROR) - -if(LTO_SUPPORTED) - message(STATUS "LTO is supported and enabled.") - set(FINUFFT_INTERPROCEDURAL_OPTIMIZATION TRUE) -else() - message(WARNING "LTO is not supported: ${LTO_ERROR}") - 
set(FINUFFT_INTERPROCEDURAL_OPTIMIZATION FALSE) +if(FINUFFT_INTERPROCEDURAL_OPTIMIZATION) + include(CheckIPOSupported) + check_ipo_supported(RESULT LTO_SUPPORTED OUTPUT LTO_ERROR) + if(NOT LTO_SUPPORTED) + message(WARNING "IPO is not supported: ${LTO_ERROR}") + set(FINUFFT_INTERPROCEDURAL_OPTIMIZATION OFF) + else() + message(STATUS "IPO is supported, enabling interprocedural optimization") + endif() endif() -function(detect_cuda_architecture) - find_program(NVIDIA_SMI_EXECUTABLE nvidia-smi) +function(enable_asan target) + target_compile_options(${target} PRIVATE ${FINUFFT_SANITIZER_FLAGS}) + if(NOT (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")) + target_link_options(${target} PRIVATE ${FINUFFT_SANITIZER_FLAGS}) + endif() +endfunction() - if(NVIDIA_SMI_EXECUTABLE) - execute_process( - COMMAND ${NVIDIA_SMI_EXECUTABLE} --query-gpu=compute_cap --format=csv,noheader - OUTPUT_VARIABLE compute_cap - OUTPUT_STRIP_TRAILING_WHITESPACE - ERROR_QUIET - ) +function(finufft_link_test target) + target_link_libraries(${target} PRIVATE finufft::finufft) + if(FINUFFT_USE_DUCC0) + target_compile_definitions(${target} PRIVATE FINUFFT_USE_DUCC0) + endif() + enable_asan(${target}) + target_compile_features(${target} PRIVATE cxx_std_17) + set_target_properties(${target} PROPERTIES POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE}) + if(FINUFFT_HAS_NO_DEPRECATED_DECLARATIONS) + target_compile_options(${target} PRIVATE -Wno-deprecated-declarations) + endif() +endfunction() - if(compute_cap MATCHES "^[0-9]+\\.[0-9]+$") - string(REPLACE "." "" arch "${compute_cap}") - message(STATUS "Detected CUDA compute capability: ${compute_cap} (sm_${arch})") +# Requires CMake >= 3.13 (for target_link_options / target_compile_options) +# Usage: +# enable_lto() # Clang: ThinLTO, GCC/MSVC/NVCC: full LTO (Clang defaults to Thin) +# enable_lto( FULL) # Force full LTO on Clang +function(enable_lto target) + if(NOT FINUFFT_INTERPROCEDURAL_OPTIMIZATION) + return() + endif() + if(NOT TARGET ${target}) + message(FATAL_ERROR "enable_lto(): '${target}' is not a target") + endif() - # Pass as list of integers, not string - set(CMAKE_CUDA_ARCHITECTURES ${arch} PARENT_SCOPE) - else() - message(WARNING "Failed to parse compute capability: '${compute_cap}', defaulting to 70") - set(CMAKE_CUDA_ARCHITECTURES 70 PARENT_SCOPE) + # Optional mode: THIN (default for Clang) or FULL + set(_mode "${ARGN}") + string(TOLOWER "${_mode}" _mode) + if(_mode STREQUAL "full") + set(_clang_lto "full") + else() + set(_clang_lto "thin") + endif() + + # Turn on IPO for the target itself (lets CMake add the right flags for that target) + set_property(TARGET ${target} PROPERTY INTERPROCEDURAL_OPTIMIZATION ON) + + # Figure out target type (to decide whether to propagate to consumers) + get_target_property(_ttype ${target} TYPE) + set(_is_lib FALSE) + if( + _ttype STREQUAL "STATIC_LIBRARY" + OR _ttype STREQUAL "SHARED_LIBRARY" + OR _ttype STREQUAL "MODULE_LIBRARY" + OR _ttype STREQUAL "OBJECT_LIBRARY" + OR _ttype STREQUAL "INTERFACE_LIBRARY" + ) + set(_is_lib TRUE) + endif() + + # Helper generator expressions + set(_C_or_CXX "$<$,$>:") + set(_CUDA "$<$:") + set(_cfg "$<$>:") + + if(MSVC) + # MSVC: /GL for compile, /LTCG for link; disable incremental linking with LTCG + if(NOT _ttype STREQUAL "INTERFACE_LIBRARY") + target_compile_options(${target} PRIVATE "${_C_or_CXX}${_cfg}/GL>>") + target_link_options(${target} PRIVATE $<$>:/LTCG /INCREMENTAL:NO>) + endif() + if(_is_lib) + target_compile_options(${target} INTERFACE "${_C_or_CXX}${_cfg}/GL>>") + 
target_link_options(${target} INTERFACE $<$>:/LTCG /INCREMENTAL:NO>) + endif() + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") + # Clang/AppleClang: prefer ThinLTO by default; lld on non-Apple + set(_clang_flag "-flto=${_clang_lto}") + + if(NOT _ttype STREQUAL "INTERFACE_LIBRARY") + target_compile_options(${target} PRIVATE "${_C_or_CXX}${_cfg}${_clang_flag}>>") + if(APPLE) + target_link_options(${target} PRIVATE $<$>:${_clang_flag}>) + else() + target_link_options(${target} PRIVATE $<$>:${_clang_flag} -fuse-ld=lld>) + endif() + endif() + + if(_is_lib) + target_compile_options(${target} INTERFACE "${_C_or_CXX}${_cfg}${_clang_flag}>>") + if(APPLE) + target_link_options(${target} INTERFACE $<$>:${_clang_flag}>) + else() + target_link_options(${target} INTERFACE $<$>:${_clang_flag} -fuse-ld=lld>) + endif() + endif() + + # Prefer llvm-ar/ranlib if available + find_program(LLVM_AR NAMES llvm-ar) + find_program(LLVM_RANLIB NAMES llvm-ranlib) + if(LLVM_AR AND LLVM_RANLIB) + set(CMAKE_AR "${LLVM_AR}" PARENT_SCOPE) + set(CMAKE_RANLIB "${LLVM_RANLIB}" PARENT_SCOPE) + endif() + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # GCC: -flto; relies on binutils LTO plugin + if(NOT _ttype STREQUAL "INTERFACE_LIBRARY") + target_compile_options(${target} PRIVATE "${_C_or_CXX}${_cfg}-flto>>") + target_link_options(${target} PRIVATE $<$>:-flto>) + endif() + if(_is_lib) + target_compile_options(${target} INTERFACE "${_C_or_CXX}${_cfg}-flto>>") + target_link_options(${target} INTERFACE $<$>:-flto>) + endif() + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "NVIDIA") + # NVCC: use -dlto for device link-time optimization (CUDA >=11.2) + if(NOT _ttype STREQUAL "INTERFACE_LIBRARY") + target_compile_options(${target} PRIVATE "${_CUDA}${_cfg}-dlto>>") + target_link_options(${target} PRIVATE $<$>:-dlto>) + endif() + if(_is_lib) + target_compile_options(${target} INTERFACE "${_CUDA}${_cfg}-dlto>>") + target_link_options(${target} INTERFACE $<$>:-dlto>) endif() else() - message(WARNING "nvidia-smi not found, defaulting CMAKE_CUDA_ARCHITECTURES to 70") - set(CMAKE_CUDA_ARCHITECTURES 70 PARENT_SCOPE) + message( + WARNING + "enable_lto(): unknown compiler '${CMAKE_CXX_COMPILER_ID}'. IPO enabled for '${target}', but flags may not propagate." + ) endif() endfunction() diff --git a/devel/padding.cpp b/devel/padding.cpp index 727023ba7..e527f1208 100644 --- a/devel/padding.cpp +++ b/devel/padding.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include template static constexpr auto BestSIMDHelper(); diff --git a/docs/opts.rst b/docs/opts.rst index 70817037b..a9b9bea90 100644 --- a/docs/opts.rst +++ b/docs/opts.rst @@ -185,6 +185,12 @@ There is thus little reason for the nonexpert to mess with this option. * ``spread_kerpad=1`` : pad to next multiple of four +**spread_simd**: implementation used for spreading and interpolation. + +* ``spread_simd=0`` : automatic choice (currently same as ``2``) +* ``spread_simd=1`` : scalar loops without vectorization +* ``spread_simd=2`` : manual SIMD vectorization + **upsampfac**: This is the internal factor by which the FFT (fine grid) is chosen larger than @@ -199,6 +205,13 @@ for only two settings, as follows. Otherwise, setting it to zero chooses a good * ``upsampfac=1.25`` : low-upsampling option, with lower RAM, smaller FFTs, but wider spreading kernel. The latter can be much faster than the standard when the number of nonuniform points is similar or smaller to the number of modes, and/or if low accuracy is required. It is especially much (2 to 3 times) faster for type 3 transforms. 
However, the kernel widths :math:`w` are about 50% larger in each dimension, which can lead to slower spreading (it can also be faster due to the smaller size of the fine grid). Because the kernel width is limited to 16, currently, thus only 9-digit accuracy can currently be reached when using ``upsampfac=1.25``. +**hint_nj**: Estimated number of nonuniform points available at plan time. +If nonzero, ``makeplan`` uses this estimate to choose ``upsampfac``. +Each ``setpts`` call recomputes the factor using the actual ``nj`` and +re-initializes the plan if it changes. A value of ``0`` defers the choice +until ``setpts``. If ``upsampfac`` is set explicitly, ``hint_nj`` is ignored. + + **spread_thread**: in the case of multiple transforms per call (``ntr>1``, or the "many" interfaces), controls how multithreading is used to spread/interpolate each batch of data. * ``spread_thread=0`` : makes an automatic choice between the below. Recommended. diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index eacb9ff2e..da0175907 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -34,6 +34,7 @@ foreach(EXAMPLE ${EXAMPLES_C}) endforeach() if(FINUFFT_USE_OPENMP) + find_package(OpenMP REQUIRED) foreach(EXAMPLE ${EXAMPLES_OPENMP}) add_executable(${EXAMPLE} ${EXAMPLE}.cpp) target_link_libraries(${EXAMPLE} PRIVATE finufft OpenMP::OpenMP_CXX) diff --git a/examples/cuda/example2d1many.cpp b/examples/cuda/example2d1many.cpp index a5d0ecd5d..dc3da6484 100644 --- a/examples/cuda/example2d1many.cpp +++ b/examples/cuda/example2d1many.cpp @@ -9,12 +9,10 @@ #include #include +#include "../test/utils/norms.hpp" #include -#include #include -// FIXME: This isn't actually public, though maybe it should be? -using cufinufft::utils::infnorm; int main(int argc, char *argv[]) /* @@ -99,7 +97,7 @@ int main(int argc, char *argv[]) int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2); // choose some mode index to check std::complex Ft = std::complex(0, 0), J = std::complex(0, 1) * (float)iflag; - for (CUFINUFFT_BIGINT j = 0; j < M; ++j) + for (auto j = 0UL; j < M; ++j) Ft += c[j + i * M] * exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array printf("[gpu %3d] one mode: abs err in F[%d,%d] is %.3g\n", i, nt1, nt2, diff --git a/examples/cuda/example2d2many.cpp b/examples/cuda/example2d2many.cpp index a6b0c6d3e..1427e5d1c 100644 --- a/examples/cuda/example2d2many.cpp +++ b/examples/cuda/example2d2many.cpp @@ -5,17 +5,14 @@ #include #include #include -#include +#include #include +#include "../test/utils/norms.hpp" #include -#include #include -// FIXME: This isn't actually public, though maybe it should be? -using cufinufft::utils::infnorm; - int main(int argc, char *argv[]) /* * example code for 2D Type 1 transformation. diff --git a/examples/cuda/example2d3many.cpp b/examples/cuda/example2d3many.cpp index 66f49d3e7..781886123 100644 --- a/examples/cuda/example2d3many.cpp +++ b/examples/cuda/example2d3many.cpp @@ -9,12 +9,10 @@ #include #include -#include -#include +#include "../test/utils/norms.hpp" -// FIXME: This isn't actually public, though maybe it should be? 
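// ---------------------------------------------------------------------------
// A minimal usage sketch (not part of the patch) for the new opts fields
// documented in docs/opts.rst above: spread_simd and hint_nj are plain members
// of finufft_opts, set before makeplan via the existing double-precision guru
// C API from finufft.h. The helper name, array sizes and tolerance below are
// illustrative only.
#include <complex>
#include <cstdint>
#include <finufft.h>

static int type1_with_hints(int64_t M, int64_t N1, double *x,
                            std::complex<double> *c, std::complex<double> *F) {
  finufft_opts opts;
  finufft_default_opts(&opts);
  opts.spread_simd = 0;          // 0 = automatic choice (currently the manual-SIMD path)
  opts.hint_nj     = (size_t)M;  // let makeplan pick upsampfac for ~M points
  finufft_plan plan;
  int64_t n_modes[3] = {N1, 1, 1};
  int ier = finufft_makeplan(1, 1, n_modes, +1, 1, 1e-9, &plan, &opts);
  if (ier) return ier;
  // setpts recomputes the factor from the actual nj and may re-init the plan
  ier = finufft_setpts(plan, M, x, nullptr, nullptr, 0, nullptr, nullptr, nullptr);
  if (!ier) ier = finufft_execute(plan, c, F);
  finufft_destroy(plan);
  return ier;
}
// ---------------------------------------------------------------------------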
-using cufinufft::utils::infnorm; +#include int main(int argc, char *argv[]) /* diff --git a/fortran/CMakeLists.txt b/fortran/CMakeLists.txt index 64d035964..98eb08d49 100644 --- a/fortran/CMakeLists.txt +++ b/fortran/CMakeLists.txt @@ -10,7 +10,6 @@ add_library( ) set(FORTRAN_EXAMPLES - guru1d1 nufft1d_demo nufft2d_demo nufft2dmany_demo @@ -18,10 +17,18 @@ set(FORTRAN_EXAMPLES simple1d1 ) +if(NOT FINUFFT_USE_DUCC0) + list(APPEND FORTRAN_EXAMPLES guru1d1) +endif() foreach(EXAMPLE ${FORTRAN_EXAMPLES}) add_executable(fort_${EXAMPLE} examples/${EXAMPLE}.f) add_executable(fort_${EXAMPLE}f examples/${EXAMPLE}f.f) - target_link_libraries(fort_${EXAMPLE} PRIVATE directft finufft ${FINUFFT_FFTLIBS}) - target_link_libraries(fort_${EXAMPLE}f PRIVATE directft finufft ${FINUFFT_FFTLIBS}) + target_link_libraries(fort_${EXAMPLE} PRIVATE directft finufft) + target_link_libraries(fort_${EXAMPLE}f PRIVATE directft finufft) + + if(BUILD_TESTING AND FINUFFT_BUILD_TESTS) + add_test(NAME fort_${EXAMPLE} COMMAND fort_${EXAMPLE}) + add_test(NAME fort_${EXAMPLE}f COMMAND fort_${EXAMPLE}f) + endif() endforeach() diff --git a/include/common/common.h b/include/common/common.h new file mode 100644 index 000000000..2e525ffb6 --- /dev/null +++ b/include/common/common.h @@ -0,0 +1,5 @@ +#pragma once + +#include +#include +#include diff --git a/include/common/constants.h b/include/common/constants.h new file mode 100644 index 000000000..9bbc27dc9 --- /dev/null +++ b/include/common/constants.h @@ -0,0 +1,22 @@ +#pragma once + +namespace finufft { +namespace common { + +// constants needed within common +// upper bound on w, ie nspread, even when padded (see evaluate_kernel_vector); +// also for common +inline constexpr int MIN_NSPREAD = 2; +inline constexpr int MAX_NSPREAD = 16; +// max number of positive quadr nodes +inline constexpr int MAX_NQUAD = 100; +// Fraction growth cut-off in utils:arraywidcen, sets when translate in type-3 +inline constexpr double ARRAYWIDCEN_GROWFRAC = 0.1; +// How many ways there are to evaluate the kernel; it should match the available options +// in finufft_opts +inline constexpr int KEREVAL_METHODS = 2; +inline constexpr double PI = 3.141592653589793238462643383279502884; +// 1 / (2 * PI) +inline constexpr double INV_2PI = 0.159154943091895335768883763372514362; +} // namespace common +} // namespace finufft diff --git a/include/common/defines.h b/include/common/defines.h new file mode 100644 index 000000000..f26e3a7c6 --- /dev/null +++ b/include/common/defines.h @@ -0,0 +1,62 @@ +#pragma once + +/* IMPORTANT: for Windows compilers, you should add a line + #define FINUFFT_DLL + here if you are compiling/using FINUFFT as a DLL, + in order to do the proper importing/exporting, or + alternatively compile with -DFINUFFT_DLL or the equivalent + command-line flag. This is not necessary under MinGW/Cygwin, where + libtool does the imports/exports automatically. + Alternatively use include(GenerateExportHeader) and + generate_export_header(finufft) to auto generate a header containing + these defines. The main reason is that if msvc changes the way it deals + with it in the future we just need to update cmake for it to work + instead of having a check on the msvc version.
*/ +#if defined(FINUFFT_DLL) +#if defined(_WIN32) || defined(__WIN32__) +#if defined(dll_EXPORTS) +#define FINUFFT_EXPORT __declspec(dllexport) +#else +#define FINUFFT_EXPORT __declspec(dllimport) +#endif +#else +#define FINUFFT_EXPORT __attribute__((visibility("default"))) +#endif +#else +#define FINUFFT_EXPORT +#endif + +/* specify calling convention (Windows only) + The cdecl calling convention is actually not the default in all but a very + few C/C++ compilers. + If the user code changes the default compiler calling convention, may need + this when generating DLL. */ +#if defined(_WIN32) || defined(__WIN32__) +#define FINUFFT_CDECL __cdecl +#else +#define FINUFFT_CDECL +#endif + +// common function attributes +#if defined(_MSC_VER) +#define FINUFFT_ALWAYS_INLINE __forceinline +#define FINUFFT_NEVER_INLINE __declspec(noinline) +#define FINUFFT_RESTRICT __restrict +#define FINUFFT_UNREACHABLE __assume(0) +#define FINUFFT_UNLIKELY(x) (x) +#define FINUFFT_LIKELY(x) (x) +#elif defined(__GNUC__) || defined(__clang__) +#define FINUFFT_ALWAYS_INLINE __attribute__((always_inline)) inline +#define FINUFFT_NEVER_INLINE __attribute__((noinline)) +#define FINUFFT_RESTRICT __restrict__ +#define FINUFFT_UNREACHABLE __builtin_unreachable() +#define FINUFFT_UNLIKELY(x) __builtin_expect(!!(x), 0) +#define FINUFFT_LIKELY(x) __builtin_expect(!!(x), 1) +#else +#define FINUFFT_ALWAYS_INLINE inline +#define FINUFFT_NEVER_INLINE +#define FINUFFT_RESTRICT +#define FINUFFT_UNREACHABLE +#define FINUFFT_UNLIKELY(x) (x) +#define FINUFFT_LIKELY(x) (x) +#endif diff --git a/include/common/utils.h b/include/common/utils.h new file mode 100644 index 000000000..dc84cce2e --- /dev/null +++ b/include/common/utils.h @@ -0,0 +1,230 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "defines.h" + +namespace finufft { +namespace common { + +FINUFFT_EXPORT void FINUFFT_CDECL gaussquad(int n, double *xgl, double *wgl); +std::tuple leg_eval(int n, double x); + +// helper to generate the integer sequence in range [Start, End] +template struct offset_seq; + +template +struct offset_seq> { + using type = std::integer_sequence; +}; + +template +using make_range = + typename offset_seq>::type; + +template struct DispatchParam { + int runtime_val; + using seq_type = Seq; +}; + +// Cartesian product over integer sequences. +// Invokes f.template operator()<...>() for each combination of values. +// The functor F must provide a templated call operator. +// Adapted upon suggestion from Nils Wentzell: godbolt.org/z/GM94xb1j4 +// +namespace detail { + +template struct Product; + +// Recursive case: at least two sequences remaining +template +struct Product, Seq2, Rest...> { + template static void apply(F &f) { + (Product::template apply(f), ...); + } +}; + +// Base case: single sequence left +template struct Product> { + template static void apply(F &f) { + (f.template operator()(), ...); + } +}; + +template void product(F &f, Seq...) 
{ + Product::template apply<>(f); +} + +// Helper functor invoked for each combination to check runtime values +template +struct DispatcherCaller { + Func &func; + const std::array &vals; + ArgTuple &args; + std::conditional_t, char, ResultType> result{}; + template void operator()() { + static constexpr std::array p{Params...}; + if (p == vals) { + if constexpr (std::is_void_v) { + std::apply( + [&](auto &&...a) { + func.template operator()(std::forward(a)...); + }, + args); + } else { + result = std::apply( + [&](auto &&...a) { + return func.template operator()(std::forward(a)...); + }, + args); + } + } + } +}; + +template struct seq_first; +template +struct seq_first> : std::integral_constant { +}; + +template +auto extract_vals_impl(const Tuple &t, std::index_sequence) { + return std::array{std::get(t).runtime_val...}; +} +template auto extract_vals(const Tuple &t) { + using T = std::remove_reference_t; + return extract_vals_impl(t, std::make_index_sequence>{}); +} + +template +auto extract_seqs_impl(const Tuple &t, std::index_sequence) { + using T = std::remove_reference_t; + return std::make_tuple(typename std::tuple_element_t::seq_type{}...); +} +template auto extract_seqs(const Tuple &t) { + using T = std::remove_reference_t; + return extract_seqs_impl(t, std::make_index_sequence>{}); +} + +template +struct dispatch_result_helper { + template + static auto test(std::index_sequence) + -> decltype(std::declval().template operator()::value...>( + std::get(std::declval())...)); + using type = decltype(test(std::make_index_sequence>{})); +}; +template struct dispatch_result; +template +struct dispatch_result> { + using type = typename dispatch_result_helper::type; +}; +template +using dispatch_result_t = typename dispatch_result::type; + +} // namespace detail + +// Generic dispatcher mapping runtime ints to template parameters. +// params is a tuple of DispatchParam holding runtime values and sequences. +// When a match is found, the functor is invoked with those template parameters +// and its result returned. Otherwise, the default-constructed result is returned. +template +decltype(auto) dispatch(Func &&func, ParamTuple &¶ms, Args &&...args) { + using tuple_t = std::remove_reference_t; + constexpr std::size_t N = std::tuple_size_v; + auto vals = detail::extract_vals(params); + auto seqs = detail::extract_seqs(params); + auto arg_tuple = std::forward_as_tuple(std::forward(args)...); + using result_t = detail::dispatch_result_t; + detail::DispatcherCaller caller{func, vals, + arg_tuple}; + std::apply([&](auto &&...s) { detail::product(caller, s...); }, seqs); + if constexpr (!std::is_void_v) return caller.result; +} + +// Compute number of iterations for half-open range [Start, Stop) with step Inc. +// Supports Inc > 0 (forward) and Inc < 0 (reverse). Inc must be nonzero. +template +inline constexpr std::int64_t compute_range_count = [] { + static_assert(Inc != 0, "Inc must not be zero"); + if constexpr (Inc > 0) { + const std::int64_t delta = Stop - Start; + return (delta > 0) ? ((delta + Inc - 1) / Inc) : 0; + } else { + const std::int64_t delta = Start - Stop; // swap for negative step + const std::int64_t step = -Inc; + return (delta > 0) ? 
((delta + step - 1) / step) : 0; + } +}(); + +// Low-level block emitter: feeds f(std::integral_constant{}) for Is in [0..N), where N = sizeof...(Is) +template +constexpr void static_loop_impl_block(F &&f, std::index_sequence) { + (f(std::integral_constant(Is) * Inc>{}), + ...); +} + +// Emit all full-size blocks (compile-time) using an index_sequence of block ids. +// C++17-friendly: no templated lambdas. +template 0 + typename F, std::size_t... Bs> +constexpr void static_loop_emit_all_blocks(F &&f, std::index_sequence) { + static_assert(K > 0, "K must be positive"); + (static_loop_impl_block(Bs) * K * Inc, // block start + Inc>(f, + std::make_index_sequence(K)>{}), + ...); +} + +// ---------------------------------------------- +// static_loop +// ---------------------------------------------- + +// Full form with Start/Stop/Inc and optional UNROLL. +// UNROLL defaults to total iteration count. If smaller, we do as many full +// UNROLL-sized blocks as possible, then a tail of (Count % UNROLL). +template, typename F> +constexpr void static_loop(F &&f) { + static_assert(Inc != 0, "Inc must not be zero"); + + constexpr std::int64_t Count = compute_range_count; + if constexpr (Count == 0) { + return; // nothing to do + } else { + // Choose k = UNROLL (if positive), else fall back to Count + constexpr std::int64_t k = (UNROLL > 0 ? UNROLL : Count); + static_assert(k > 0, "Internal error: k must be positive"); + + constexpr std::int64_t Blocks = Count / k; // number of full blocks + constexpr std::int64_t Tail = Count % k; // remainder + + // Emit full k-sized blocks + if constexpr (Blocks > 0) { + static_loop_emit_all_blocks( + std::forward(f), + std::make_index_sequence(Blocks)>{}); + } + + // Emit tail + if constexpr (Tail > 0) { + constexpr std::int64_t tail_start = Start + Blocks * k * Inc; + static_loop_impl_block( + std::forward(f), std::make_index_sequence(Tail)>{}); + } + } +} + +// Convenience: static_loop(f) => Start=0, Inc=1, UNROLL = Count +template constexpr void static_loop(F &&f) { + static_loop<0, Stop, 1, compute_range_count<0, Stop, 1>>(std::forward(f)); +} + +} // namespace common +} // namespace finufft diff --git a/include/cufinufft.h b/include/cufinufft.h index 52dd781a0..9cdff6b74 100644 --- a/include/cufinufft.h +++ b/include/cufinufft.h @@ -3,6 +3,7 @@ #include +#include #include #include @@ -12,23 +13,29 @@ typedef struct cufinufft_fplan_s *cufinufftf_plan; #ifdef __cplusplus extern "C" { #endif -void cufinufft_default_opts(cufinufft_opts *opts); - -int cufinufft_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int ntr, - double eps, cufinufft_plan *d_plan_ptr, cufinufft_opts *opts); -int cufinufftf_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int ntr, - float eps, cufinufftf_plan *d_plan_ptr, cufinufft_opts *opts); - -int cufinufft_setpts(cufinufft_plan d_plan, int64_t M, double *d_x, double *d_y, - double *d_z, int N, double *d_s, double *d_t, double *d_u); -int cufinufftf_setpts(cufinufftf_plan d_plan, int64_t M, float *d_x, float *d_y, - float *d_z, int N, float *d_s, float *d_t, float *d_u); - -int cufinufft_execute(cufinufft_plan d_plan, cuDoubleComplex *d_c, cuDoubleComplex *d_fk); -int cufinufftf_execute(cufinufftf_plan d_plan, cuFloatComplex *d_c, cuFloatComplex *d_fk); - -int cufinufft_destroy(cufinufft_plan d_plan); -int cufinufftf_destroy(cufinufftf_plan d_plan); +FINUFFT_EXPORT void cufinufft_default_opts(cufinufft_opts *opts); + +FINUFFT_EXPORT int cufinufft_makeplan(int type, int dim, const int64_t *n_modes, + int iflag, 
int ntr, double eps, + cufinufft_plan *d_plan_ptr, cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufftf_makeplan(int type, int dim, const int64_t *n_modes, + int iflag, int ntr, float eps, + cufinufftf_plan *d_plan_ptr, cufinufft_opts *opts); + +FINUFFT_EXPORT int cufinufft_setpts(cufinufft_plan d_plan, int64_t M, double *d_x, + double *d_y, double *d_z, int N, double *d_s, + double *d_t, double *d_u); +FINUFFT_EXPORT int cufinufftf_setpts(cufinufftf_plan d_plan, int64_t M, float *d_x, + float *d_y, float *d_z, int N, float *d_s, + float *d_t, float *d_u); + +FINUFFT_EXPORT int cufinufft_execute(cufinufft_plan d_plan, cuDoubleComplex *d_c, + cuDoubleComplex *d_fk); +FINUFFT_EXPORT int cufinufftf_execute(cufinufftf_plan d_plan, cuFloatComplex *d_c, + cuFloatComplex *d_fk); + +FINUFFT_EXPORT int cufinufft_destroy(cufinufft_plan d_plan); +FINUFFT_EXPORT int cufinufftf_destroy(cufinufftf_plan d_plan); #ifdef __cplusplus } #endif diff --git a/include/cufinufft/defs.h b/include/cufinufft/defs.h index caa331bae..6b8ca24a2 100644 --- a/include/cufinufft/defs.h +++ b/include/cufinufft/defs.h @@ -1,18 +1,8 @@ #ifndef CUFINUFFT_DEFS_H #define CUFINUFFT_DEFS_H +#include #include -// constants needed within common -// upper bound on w, ie nspread, even when padded (see evaluate_kernel_vector); also for -// common -#define MAX_NSPREAD 16 -#define MIN_NSPREAD 2 - -// max number of positive quadr nodes -#define MAX_NQUAD 100 - -// Fraction growth cut-off in utils:arraywidcen, sets when translate in type-3 -#define ARRAYWIDCEN_GROWFRAC 0.1 // FIXME: If cufft ever takes N > INT_MAX... constexpr int32_t MAX_NF = std::numeric_limits::max(); diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 8c6c221ea..9d0d8e216 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -72,6 +72,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran Marco Barbone 07/26/24. Using SM when shared memory available is enough. */ using namespace cufinufft::common; + using namespace finufft::common; int ier; if (type < 1 || type > 3) { fprintf(stderr, "[%s] Invalid type (%d): should be 1, 2, or 3.\n", __func__, type); diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index 3bc620b29..131a84a39 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -4,25 +4,21 @@ // octave (mkoctfile) needs this otherwise it doesn't know what int64_t is! 
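// ---------------------------------------------------------------------------
// A minimal call-sequence sketch (not part of the patch) for the exported plan
// interface declared in include/cufinufft.h above, showing the 64-bit M now
// taken by setpts. Double precision, 1D type 1; the helper name is illustrative,
// the device arrays d_x, d_c, d_fk are assumed to be allocated and filled
// already, and error handling is kept to a minimum.
#include <cufinufft.h>
#include <cuComplex.h>
#include <cstdint>

static int cu_type1_1d(int64_t M, int64_t N1, double *d_x,
                       cuDoubleComplex *d_c, cuDoubleComplex *d_fk) {
  cufinufft_opts opts;
  cufinufft_default_opts(&opts);              // fill defaults before any tweaks
  cufinufft_plan plan;
  const int64_t n_modes[3] = {N1, 1, 1};
  int ier = cufinufft_makeplan(1, 1, n_modes, +1, /*ntr=*/1, 1e-9, &plan, &opts);
  if (ier) return ier;
  ier = cufinufft_setpts(plan, M, d_x, nullptr, nullptr, 0, nullptr, nullptr, nullptr);
  if (!ier) ier = cufinufft_execute(plan, d_c, d_fk);
  cufinufft_destroy(plan);
  return ier;
}
// ---------------------------------------------------------------------------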
#include -#include #include #include #include +#include #include #include // for std::forward -#include +#include #ifndef _USE_MATH_DEFINES #define _USE_MATH_DEFINES #endif #include -#ifndef M_PI -#define M_PI 3.14159265358979323846 -#endif - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__) #else __inline__ __device__ double atomicAdd(double *address, double val) { @@ -72,6 +68,8 @@ template __forceinline__ __device__ auto interval(const int ns, cons namespace cufinufft { namespace utils { +using namespace finufft::common; + class WithCudaDevice { public: explicit WithCudaDevice(const int device) : orig_device_{get_orig_device()} { @@ -90,19 +88,8 @@ class WithCudaDevice { } }; -// math helpers whose source is in src/cuda/utils.cpp -CUFINUFFT_BIGINT next235beven(CUFINUFFT_BIGINT n, CUFINUFFT_BIGINT b); -void gaussquad(int n, double *xgl, double *wgl); -std::tuple leg_eval(int n, double x); - -template T infnorm(int n, std::complex *a) { - T nrm = 0.0; - for (int m = 0; m < n; ++m) { - T aa = real(conj(a[m]) * a[m]); - if (aa > nrm) nrm = aa; - } - return sqrt(nrm); -} +// math helpers whose source is in src/utils.cpp +long next235beven(long n, long b); /** * does a complex atomic add on a shared memory address @@ -124,8 +111,8 @@ static __forceinline__ __device__ void atomicAddComplexShared( * on shared memory are supported so we leverage them */ template -static __forceinline__ __device__ void atomicAddComplexGlobal( - cuda_complex *address, cuda_complex res) { +static __forceinline__ __device__ void atomicAddComplexGlobal(cuda_complex *address, + cuda_complex res) { if constexpr ( std::is_same_v, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) { atomicAdd(address, res); @@ -150,7 +137,7 @@ template auto arrayrange(int n, T *a, cudaStream_t stream) { // Writes out w = half-width and c = center of an interval enclosing all a[n]'s // Only chooses a nonzero center if this increases w by less than fraction -// ARRAYWIDCEN_GROWFRAC defined in defs.h. +// ARRAYWIDCEN_GROWFRAC defined in common/constants.h. // This prevents rephasings which don't grow nf by much. 6/8/17 // If n==0, w and c are not finite. template auto arraywidcen(int n, T *a, cudaStream_t stream) { @@ -180,41 +167,27 @@ auto set_nhg_type3(T S, T X, const cufinufft_opts &opts, else Ssafe = std::max(Ssafe, T(1) / X); // use the safe X and S... - T nfd = 2.0 * opts.upsampfac * Ssafe * Xsafe / M_PI + nss; + T nfd = 2.0 * opts.upsampfac * Ssafe * Xsafe / PI + nss; if (!std::isfinite(nfd)) nfd = 0.0; // use FLT to catch inf auto nf = (int)nfd; // printf("initial nf=%lld, ns=%d\n",*nf,spopts.nspread); // catch too small nf, and nan or +-inf, otherwise spread fails... 
if (nf < 2 * spopts.nspread) nf = 2 * spopts.nspread; - if (nf < MAX_NF) // otherwise will fail anyway - nf = utils::next235beven(nf, 1); // expensive at huge nf + if (nf < MAX_NF) // otherwise will fail anyway + nf = next235beven(nf, 1); // expensive at huge nf // Note: b is 1 because type 3 uses a type 2 plan, so it should not need the extra // condition that seems to be used by Block Gather as type 2 are only GM-sort - auto h = 2 * T(M_PI) / nf; // upsampled grid spacing + auto h = 2 * T(PI) / nf; // upsampled grid spacing auto gam = T(nf) / (2.0 * opts.upsampfac * Ssafe); // x scale fac to x' return std::make_tuple(nf, h, gam); } -// Generalized dispatcher for any function requiring ns-based dispatch -template -int dispatch_ns(Func &&func, int target_ns, Args &&...args) { - if constexpr (ns > MAX_NSPREAD) { - return FINUFFT_ERR_METHOD_NOTVALID; // Stop recursion - } else { - if (target_ns == ns) { - return std::forward(func).template operator()( - std::forward(args)...); - } - return dispatch_ns(std::forward(func), target_ns, - std::forward(args)...); - } -} - -// Wrapper function that starts the dispatch recursion +// Wrapper around the generic dispatcher for nspread-based dispatch template -int launch_dispatch_ns(Func &&func, int target_ns, Args &&...args) { - return dispatch_ns(std::forward(func), target_ns, - std::forward(args)...); +auto launch_dispatch_ns(Func &&func, int target_ns, Args &&...args) { + using NsSeq = make_range; + auto params = std::make_tuple(DispatchParam{target_ns}); + return dispatch(std::forward(func), params, std::forward(args)...); } /** diff --git a/include/finufft.fh b/include/finufft.fh index f86911b9a..baf87a8ea 100644 --- a/include/finufft.fh +++ b/include/finufft.fh @@ -14,8 +14,9 @@ c diagnostic opts... c alg performance opts... integer nthreads, fftw, spread_sort, spread_kerevalmeth - integer spread_kerpad + integer spread_kerpad, spread_simd real*8 upsampfac + integer*8 hint_nj integer spread_thread, maxbatchsize, spread_nthr_atomic integer spread_max_sp_size integer fftw_lock_fun, fftw_unlock_fun, fftw_lock_data diff --git a/include/finufft/finufft_core.h b/include/finufft/finufft_core.h index 9aaf36595..223fb2a0e 100644 --- a/include/finufft/finufft_core.h +++ b/include/finufft/finufft_core.h @@ -1,69 +1,12 @@ #ifndef FINUFFT_CORE_H #define FINUFFT_CORE_H -#include - #include -#include +#include #include -/* IMPORTANT: for Windows compilers, you should add a line - #define FINUFFT_DLL - here if you are compiling/using FINUFFT as a DLL, - in order to do the proper importing/exporting, or - alternatively compile with -DFINUFFT_DLL or the equivalent - command-line flag. This is not necessary under MinGW/Cygwin, where - libtool does the imports/exports automatically. - Alternatively use include(GenerateExportHeader) and - generate_export_header(finufft) to auto generate an header containing - these defines.The main reason is that if msvc changes the way it deals - with it in the future we just need to update cmake for it to work - instead of having a check on the msvc version. */ -#if defined(FINUFFT_DLL) && (defined(_WIN32) || defined(__WIN32__)) -#if defined(dll_EXPORTS) -#define FINUFFT_EXPORT __declspec(dllexport) -#else -#define FINUFFT_EXPORT __declspec(dllimport) -#endif -#else -#define FINUFFT_EXPORT -#endif - -/* specify calling convention (Windows only) - The cdecl calling convention is actually not the default in all but a very - few C/C++ compilers. 
- If the user code changes the default compiler calling convention, may need - this when generating DLL. */ -#if defined(_WIN32) || defined(__WIN32__) -#define FINUFFT_CDECL __cdecl -#else -#define FINUFFT_CDECL -#endif - -// inline macro, to force inlining of small functions -// this avoids the use of macros to implement functions -#if defined(_MSC_VER) -#define FINUFFT_ALWAYS_INLINE __forceinline inline -#define FINUFFT_NEVER_INLINE __declspec(noinline) -#define FINUFFT_RESTRICT __restrict -#define FINUFFT_UNREACHABLE __assume(0) -#define FINUFFT_UNLIKELY(x) (x) -#define FINUFFT_LIKELY(x) (x) -#elif defined(__GNUC__) || defined(__clang__) -#define FINUFFT_ALWAYS_INLINE __attribute__((always_inline)) inline -#define FINUFFT_NEVER_INLINE __attribute__((noinline)) -#define FINUFFT_RESTRICT __restrict__ -#define FINUFFT_UNREACHABLE __builtin_unreachable() -#define FINUFFT_UNLIKELY(x) __builtin_expect(!!(x), 0) -#define FINUFFT_LIKELY(x) __builtin_expect(!!(x), 1) -#else -#define FINUFFT_ALWAYS_INLINE inline -#define FINUFFT_NEVER_INLINE -#define FINUFFT_RESTRICT -#define FINUFFT_UNREACHABLE -#define FINUFFT_UNLIKELY(x) (x) -#define FINUFFT_LIKELY(x) (x) -#endif +#include "common/common.h" +#include "finufft_errors.h" // All indexing in library that potentially can exceed 2^31 uses 64-bit signed. // This includes all calling arguments (eg M,N) that could be huge someday. @@ -75,20 +18,6 @@ using UBIGINT = uint64_t; // Library version (is a string) #define FINUFFT_VER "2.5.0dev" -// Smallest possible kernel spread width per dimension, in fine grid points -// (used only in spreadinterp.cpp) -inline constexpr int MIN_NSPREAD = 2; - -// Largest possible kernel spread width per dimension, in fine grid points -// (used only in spreadinterp.cpp) -inline constexpr int MAX_NSPREAD = 16; - -// Fraction growth cut-off in utils:arraywidcen, sets when translate in type-3 -inline constexpr double ARRAYWIDCEN_GROWFRAC = 0.1; - -// Max number of positive quadr nodes for kernel FT (used only in common.cpp) -inline constexpr int MAX_NQUAD = 100; - // Internal (nf1 etc) array allocation size that immediately raises error. // (Note: next235 takes 1s for 1e11, so it is also to prevent hang here.) // Increase this if you need >10TB (!) RAM... @@ -98,10 +27,6 @@ inline constexpr BIGINT MAX_NF = BIGINT(1e12); // values for M = nj (also nk in type 3)... inline constexpr BIGINT MAX_NU_PTS = BIGINT(1e14); -// We define our own PI here because M_PI is not actually part of standard C++ -inline constexpr double PI = 3.14159265358979329; -inline constexpr double INV_2PI = 0.159154943091895336; - // ----- OpenMP macros which also work when omp not present ----- // Allows compile-time switch off of openmp, so compilation without any openmp // is done (Note: _OPENMP is automatically set by -fopenmp compile flag) @@ -196,6 +121,10 @@ template struct FINUFFT_PLAN_T { // the main plan class, fully C++ finufft_opts opts; // this and spopts could be made ptrs finufft_spread_opts spopts; + bool upsamp_locked; + + int init_spreader_and_fft(); + // Remaining actions (not create/delete) in guru interface are now methods... int setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF *s, TF *t, TF *u); int execute_internal(TC *cj, TC *fk, bool adjoint = false, int ntrans_actual = -1, diff --git a/include/finufft/finufft_utils.hpp b/include/finufft/finufft_utils.hpp index 08cf4d372..3117c040e 100644 --- a/include/finufft/finufft_utils.hpp +++ b/include/finufft/finufft_utils.hpp @@ -3,12 +3,9 @@ #pragma once -#include -// for CNTime... 
-// (use chrono since the interface is portable between linux and windows) -#include - #include "finufft_core.h" +#include +#include namespace finufft::utils { @@ -31,7 +28,7 @@ FINUFFT_EXPORT FINUFFT_ALWAYS_INLINE void FINUFFT_CDECL arraywidcen(BIGINT n, co T *w, T *c) // Writes out w = half-width and c = center of an interval enclosing all a[n]'s // Only chooses a nonzero center if this increases w by less than fraction -// ARRAYWIDCEN_GROWFRAC defined in finufft_core.h. +// ARRAYWIDCEN_GROWFRAC defined in common/constants.h. // This prevents rephasings which don't grow nf by much. 6/8/17 // If n==0, w and c are not finite. { @@ -39,7 +36,7 @@ FINUFFT_EXPORT FINUFFT_ALWAYS_INLINE void FINUFFT_CDECL arraywidcen(BIGINT n, co arrayrange(n, a, &lo, &hi); *w = (hi - lo) / 2; *c = (hi + lo) / 2; - if (std::abs(*c) < ARRAYWIDCEN_GROWFRAC * (*w)) { + if (std::abs(*c) < common::ARRAYWIDCEN_GROWFRAC * (*w)) { *w += std::abs(*c); *c = 0.0; } @@ -47,8 +44,6 @@ FINUFFT_EXPORT FINUFFT_ALWAYS_INLINE void FINUFFT_CDECL arraywidcen(BIGINT n, co // routines in finufft_utils.cpp ... FINUFFT_EXPORT BIGINT next235even(BIGINT n); -FINUFFT_EXPORT void gaussquad(int n, double *xgl, double *wgl); -FINUFFT_EXPORT std::tuple leg_eval(int n, double x); // jfm's timer class class FINUFFT_EXPORT CNTime { diff --git a/include/finufft/heuristics.hpp b/include/finufft/heuristics.hpp index 1f9cb22ed..45704c94c 100644 --- a/include/finufft/heuristics.hpp +++ b/include/finufft/heuristics.hpp @@ -332,7 +332,7 @@ double bestUpsamplingFactor(const int nthreads, const double density, const int } // 2) Special-case for nufftType == 3 - // TODO: maybe use the bandwidth here? + // TODO: use the bandwidth to populate the table. if (nufftType == 3) { return 1.25; } diff --git a/include/finufft/test_defs.h b/include/finufft/test_defs.h index 0a0dd9a5f..16df1e4b8 100644 --- a/include/finufft/test_defs.h +++ b/include/finufft/test_defs.h @@ -15,6 +15,7 @@ #include // convenient private finufft internals that tests need +#include #include #include #include @@ -36,6 +37,7 @@ using CPX = std::complex; // -------------- Math consts (not in math.h) and useful math macros ---------- #include +using ::finufft::common::PI; // either-precision unit imaginary number... #define IMA (CPX(0.0, 1.0)) @@ -116,7 +118,6 @@ static inline CPX crandm11r [[maybe_unused]] (unsigned int *x) { // -------- FINUFFT's plan object, prec-switching version ------------------ // NB: now private (the public C++ or C etc user sees an opaque pointer to it) -#include struct FINUFFT_PLAN_S : public FINUFFT_PLAN_T {}; // std stuff for tester src diff --git a/include/finufft/xsimd.hpp b/include/finufft/xsimd.hpp new file mode 100644 index 000000000..54cd7a827 --- /dev/null +++ b/include/finufft/xsimd.hpp @@ -0,0 +1,14 @@ +#ifndef FINUFFT_XSIMD_HPP +#define FINUFFT_XSIMD_HPP + +#include + +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE +#undef XSIMD_NO_SUPPORTED_ARCHITECTURE +#define XSIMD_WITH_EMULATED 1 +#define XSIMD_DEFAULT_ARCH emulated<128> +#endif + +#include + +#endif // FINUFFT_XSIMD_HPP diff --git a/include/finufft_eitherprec.h b/include/finufft_eitherprec.h index a7ea02e40..d534d6020 100644 --- a/include/finufft_eitherprec.h +++ b/include/finufft_eitherprec.h @@ -36,38 +36,7 @@ // the plan object pointed to...
(doesn't need to be even defined here) #define FINUFFT_PLAN_S FINUFFTIFY(_plan_s) -/* IMPORTANT: for Windows compilers, you should add a line - #define FINUFFT_DLL - here if you are compiling/using FINUFFT as a DLL, - in order to do the proper importing/exporting, or - alternatively compile with -DFINUFFT_DLL or the equivalent - command-line flag. This is not necessary under MinGW/Cygwin, where - libtool does the imports/exports automatically. - Alternatively use include(GenerateExportHeader) and - generate_export_header(finufft) to auto generate an header containing - these defines.The main reason is that if msvc changes the way it deals - with it in the future we just need to update cmake for it to work - instead of having a check on the msvc version. */ -#if defined(FINUFFT_DLL) && (defined(_WIN32) || defined(__WIN32__)) -#if defined(dll_EXPORTS) -#define FINUFFT_EXPORT __declspec(dllexport) -#else -#define FINUFFT_EXPORT __declspec(dllimport) -#endif -#else -#define FINUFFT_EXPORT -#endif - -/* specify calling convention (Windows only) - The cdecl calling convention is actually not the default in all but a very - few C/C++ compilers. - If the user code changes the default compiler calling convention, may need - this when generating DLL. */ -#if defined(_WIN32) || defined(__WIN32__) -#define FINUFFT_CDECL __cdecl -#else -#define FINUFFT_CDECL -#endif +#include //////////////////////////////////////////////////////////////////// // PUBLIC METHOD INTERFACES. All are C-style even when used from C++... diff --git a/include/finufft_mod.f90 b/include/finufft_mod.f90 index 5618d5078..00bf5d3e5 100644 --- a/include/finufft_mod.f90 +++ b/include/finufft_mod.f90 @@ -15,8 +15,9 @@ module finufft_mod ! alg perf opts... integer(kind=C_INT) :: nthreads,fftw,spread_sort,spread_kerevalmeth - integer(kind=C_INT) :: spread_kerpad + integer(kind=C_INT) :: spread_kerpad, spread_simd real(kind=C_DOUBLE) :: upsampfac + integer(kind=C_SIZE_T) :: hint_nj integer(kind=C_INT) :: spread_thread, maxbatchsize integer(kind=C_INT) :: spread_nthr_atomic, spread_max_sp_size integer(kind=C_SIZE_T) :: fftw_lock_fun, fftw_unlock_fun, fftw_lock_data diff --git a/include/finufft_opts.h b/include/finufft_opts.h index 381e888d3..f1ed4c6e0 100644 --- a/include/finufft_opts.h +++ b/include/finufft_opts.h @@ -5,6 +5,8 @@ #ifndef FINUFFT_OPTS_H #define FINUFFT_OPTS_H +#include + typedef struct finufft_opts { // defaults see finufft_core.cpp:finufft_default_opts_t() // sphinx tag (don't remove): @opts_start // FINUFFT options: @@ -25,7 +27,9 @@ typedef struct finufft_opts { // defaults see finufft_core.cpp:finufft_default_o int spread_sort; // spreader: 0 don't sort, 1 do, or 2 heuristic choice int spread_kerevalmeth; // spreader: 0 exp(sqrt()), 1 Horner piecewise poly (faster) int spread_kerpad; // (exp(sqrt()) only): 0 don't pad kernel to 4n, 1 do + int spread_simd; // 0 auto(=2), 1 scalar, 2 manual vectorization double upsampfac; // upsampling ratio sigma: 2.0 std, 1.25 small FFT, 0.0 auto + size_t hint_nj; // estimated nj at plan time; 0 means unknown int spread_thread; // (vectorized ntr>1 only): 0 auto, 1 seq multithreaded, // 2 parallel single-thread spread int maxbatchsize; // (vectorized ntr>1 only): max transform batch, 0 auto diff --git a/include/finufft_spread_opts.h b/include/finufft_spread_opts.h index 28f72f5b4..2c3e9f4f8 100644 --- a/include/finufft_spread_opts.h +++ b/include/finufft_spread_opts.h @@ -16,6 +16,7 @@ typedef struct finufft_spread_opts { int kerevalmeth; // 0: direct exp(sqrt()), or 1: Horner ppval, 
fastest int kerpad; // 0: no pad w to mult of 4, 1: do pad // (this helps SIMD for kerevalmeth=0, eg on i7). + int simd; // 1 scalar, 2 manual vectorization int nthreads; // # threads for spreadinterp (0: use max avail) int sort_threads; // # threads for sort (0: auto-choice up to nthreads) int max_subproblem_size; // # pts per t1 subprob; sets extra RAM per thread diff --git a/iwyu.imp b/iwyu.imp new file mode 100644 index 000000000..75dcde526 --- /dev/null +++ b/iwyu.imp @@ -0,0 +1,16 @@ +[ + { "include": [ "@", "private", "", "public" ] }, + { "include": [ "@\"xsimd/.*\"", "private", "", "public" ] }, + + { "include": [ "@", "private", "\"common/common.h\"", "public" ] }, + { "include": [ "@\"common/.*\"", "private", "\"common/common.h\"", "public" ] }, + + { "include": [ "@", "private", "", "public" ] }, + { "include": [ "@\"thrust/(.*)\"", "private", "", "public" ] }, + + { "include": [ "@", "private", "", "public" ] }, + { "include": [ "@\"ducc/(.*)\"", "private", "", "public" ] }, + + { "include": [ "@", "private", "", "public" ] }, + { "include": [ "@\"fftw3/.*\"", "private", "", "public" ] } +] diff --git a/makefile b/makefile index 883285a91..04420ab47 100644 --- a/makefile +++ b/makefile @@ -61,12 +61,12 @@ DEPS_ROOT := deps # xsimd header-only dependency repo XSIMD_URL := https://github.com/xtensor-stack/xsimd.git -XSIMD_VERSION := 13.0.0 +XSIMD_VERSION := 13.2.0 XSIMD_DIR := $(DEPS_ROOT)/xsimd # DUCC sources optional dependency repo DUCC_URL := https://gitlab.mpcdf.mpg.de/mtr/ducc.git -DUCC_VERSION := ducc0_0_35_0 +DUCC_VERSION := ducc0_0_38_0 DUCC_DIR := $(DEPS_ROOT)/ducc # this dummy file used as empty target by make... DUCC_COOKIE := $(DUCC_DIR)/.finufft_has_ducc @@ -136,7 +136,7 @@ STATICLIB = lib-static/$(LIBNAME).a ABSDYNLIB = $(FINUFFT)$(DYNLIB) # spreader objs -SOBJS = src/finufft_utils.o src/spreadinterp.o +SOBJS = src/finufft_utils.o src/utils.o src/spreadinterp.o # all lib dual-precision objs (note DUCC_OBJS empty if unused) OBJS = $(SOBJS) src/fft.o src/finufft_core.o src/c_interface.o fortran/finufftfort.o $(DUCC_OBJS) @@ -174,7 +174,7 @@ usage: @echo "Also see docs/install.rst and docs/README" # collect headers for implicit depends (we don't separate public from private) -HEADERS = $(wildcard include/*.h include/finufft/*.h) +HEADERS = $(wildcard include/*.h include/finufft/*.h include/common/*.h) # implicit rules for objects (note -o ensures writes to correct dir) %.o: %.cpp $(HEADERS) @@ -262,10 +262,10 @@ test/%: test/%.cpp $(DYNLIB) test/%f: test/%.cpp $(DYNLIB) $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(ABSDYNLIB) $(LIBSFFT) -o $@ # low-level tests that are cleaner if depend on only specific objects... 
-test/testutils: test/testutils.cpp src/finufft_utils.o - $(CXX) $(CXXFLAGS) ${LDFLAGS} test/testutils.cpp src/finufft_utils.o $(LIBS) -o test/testutils -test/testutilsf: test/testutils.cpp src/finufft_utils.o - $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE test/testutils.cpp src/finufft_utils.o $(LIBS) -o test/testutilsf +test/testutils: test/testutils.cpp src/finufft_utils.o src/utils.o + $(CXX) $(CXXFLAGS) ${LDFLAGS} test/testutils.cpp src/finufft_utils.o src/utils.o $(LIBS) -o test/testutils +test/testutilsf: test/testutils.cpp src/finufft_utils.o src/utils.o + $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE test/testutils.cpp src/finufft_utils.o src/utils.o $(LIBS) -o test/testutilsf # make sure all double-prec test executables ready for testing TESTS := $(basename $(wildcard test/*.cpp)) @@ -457,22 +457,26 @@ docker-wheel: # ================== SETUP/COMPILE OF EXTERNAL DEPENDENCIES =============== define clone_repo - @if [ ! -d "$(3)" ]; then \ - echo "Cloning repository $(1) at tag $(2) into directory $(3)"; \ - git clone --depth=1 --branch $(2) $(1) $(3); \ - else \ - cd $(3) && \ - CURRENT_VERSION=$$(git describe --tags --abbrev=0) && \ - if [ "$$CURRENT_VERSION" = "$(2)" ]; then \ - echo "Directory $(3) already exists and is at the correct version $(2)."; \ - else \ - echo "Directory $(3) exists but is at version $$CURRENT_VERSION. Checking out the correct version $(2)."; \ - git fetch --tags && \ - git checkout $(2) || { echo "Error: Failed to checkout version $(2) in $(3)."; exit 1; }; \ - fi; \ - fi + @if [ ! -d "$(3)" ]; then \ + echo "Cloning repository $(1) at tag $(2) into directory $(3)"; \ + git clone --no-checkout $(1) $(3) && \ + cd $(3) && \ + git fetch origin tag $(2) --force && \ + git -c advice.detachedHead=false checkout $(2); \ + else \ + cd $(3) && \ + CURRENT_VERSION=$$(git describe --tags --abbrev=0 2>/dev/null || echo ""); \ + if [ "$$CURRENT_VERSION" = "$(2)" ]; then \ + echo "Directory $(3) already exists and is at the correct version $(2)."; \ + else \ + echo "Directory $(3) exists but is at version $$CURRENT_VERSION. Checking out the correct version $(2)."; \ + git fetch origin tag $(2) --force && \ + git -c advice.detachedHead=false checkout $(2) || { echo "Error: Failed to checkout version $(2) in $(3)."; exit 1; }; \ + fi; \ + fi endef + # download: header-only, no compile needed... 
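# (Assumed usage.) The dependency rules below are expected to expand this macro as, e.g.,
# $(call clone_repo,$(XSIMD_URL),$(XSIMD_VERSION),$(XSIMD_DIR)), so bumping
# XSIMD_VERSION or DUCC_VERSION above is enough to re-pin an existing checkout.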
$(XSIMD_DIR)/include/xsimd/xsimd.hpp: mkdir -p $(DEPS_ROOT) diff --git a/matlab/CMakeLists.txt b/matlab/CMakeLists.txt index c7aa4ca4c..8734c4ed0 100644 --- a/matlab/CMakeLists.txt +++ b/matlab/CMakeLists.txt @@ -1,12 +1,143 @@ -find_package(Matlab REQUIRED) -matlab_add_mex(NAME finufft_mex SRC finufft.cpp LINK_TO finufft OUTPUT_NAME finufft R2018a) -target_compile_definitions(finufft_mex PRIVATE -DR2008OO) - -file(GLOB FINUFFT_MATLAB_M_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.m) - -add_custom_command( - TARGET finufft_mex - POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${FINUFFT_MATLAB_M_SOURCES} ${CMAKE_CURRENT_BINARY_DIR} - VERBATIM -) +# Ensure MATLAB is on PATH if provided by setup-matlab +find_package(Matlab REQUIRED COMPONENTS MEX_COMPILER MX_LIBRARY) + +# MEX extension and common defines +set(MEX_EXT "${Matlab_MEX_EXTENSION}") +set(FINUFFT_MEX_DEFS R2008OO) + +# Make MEX self-contained where we can +function(make_mex_self_contained tgt) + if(UNIX AND NOT APPLE) + # Avoid MATLAB's older libstdc++ at runtime on Linux + target_link_options(${tgt} PRIVATE -static-libstdc++ -static-libgcc) + endif() +endfunction() + +# ------------------------------------------------------------ +# --- CPU MEX (uses 'mex' via matlab_add_mex) --- +if(FINUFFT_USE_CPU) + matlab_add_mex( + NAME finufft_mex + SRC "${CMAKE_CURRENT_SOURCE_DIR}/finufft.cpp" + OUTPUT_NAME "finufft" + R2018a + LINK_TO finufft + ) + target_compile_definitions(finufft_mex PRIVATE ${FINUFFT_MEX_DEFS}) + target_include_directories(finufft_mex PRIVATE $) + make_mex_self_contained(finufft_mex) +endif() + +# ------------------------------------------------------------ +# --- CUDA MEX (uses 'mexcuda' via matlab_add_mex) --- +if(FINUFFT_USE_CUDA) + find_path( + Matlab_GPU_INCLUDE_DIR + NAMES gpu/mxGPUArray.h + HINTS + "${Matlab_ROOT_DIR}/extern/include" + "${Matlab_ROOT_DIR}/toolbox/parallel/gpu/extern/include" + "${Matlab_ROOT_DIR}/toolbox/distcomp/gpu/extern/include" + ) + if(NOT Matlab_GPU_INCLUDE_DIR) + message( + FATAL_ERROR + "MATLAB GPU header mxGPUArray.h not found. \ + Ensure Parallel Computing Toolbox is installed, or add its include path." + ) + endif() + matlab_add_mex( + NAME cufinufft_mex + SRC "${CMAKE_CURRENT_SOURCE_DIR}/cufinufft.cu" + OUTPUT_NAME "cufinufft" + R2018a + LINK_TO cufinufft Matlab::mex Matlab::mx CUDA::cudart + ) + target_compile_definitions(cufinufft_mex PRIVATE ${FINUFFT_MEX_DEFS}) + target_include_directories(cufinufft_mex PRIVATE $) + target_include_directories(cufinufft_mex PRIVATE $) + make_mex_self_contained(cufinufft_mex) +endif() + +# ------------------------------------------------------------ +# MATLAB assets (.m and .docsrc files) +file(GLOB MATLAB_MFILES "${CMAKE_CURRENT_SOURCE_DIR}/*.m" "${CMAKE_CURRENT_SOURCE_DIR}/*.docsrc") + +# Copy .m files next to built MEX(es) for in-tree testing +if(TARGET finufft_mex) + add_custom_command( + TARGET finufft_mex + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${MATLAB_MFILES} $ + ) +endif() +if(TARGET cufinufft_mex) + add_custom_command( + TARGET cufinufft_mex + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${MATLAB_MFILES} $ + ) +endif() + +# ------------------------------------------------------------ +# RPATH for installed artifacts +# Option A (macOS): don't overwrite MATLAB's rpaths; let them propagate. 
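# Option B (Linux, the elseif branch below): keep $ORIGIN so a shared libfinufft placed
# next to the MEX is found at load time.
# Usage note (an assumption about how the top-level project exposes these options):
# configure with FINUFFT_USE_CPU and/or FINUFFT_USE_CUDA enabled plus
# FINUFFT_MATLAB_INSTALL, point Matlab_ROOT_DIR at the MATLAB installation, then build
# the finufft_mex / cufinufft_mex targets; the .m files are copied next to the MEX for
# in-tree testing.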
+foreach(tgt IN ITEMS finufft_mex cufinufft_mex) + if(TARGET ${tgt}) + if(APPLE) + set_target_properties( + ${tgt} + PROPERTIES + BUILD_WITH_INSTALL_RPATH OFF + INSTALL_RPATH + "" # do not clobber MATLAB rpaths + INSTALL_RPATH_USE_LINK_PATH + TRUE # propagate link-line rpaths + ) + elseif(UNIX AND NOT APPLE) + # Side-by-side loading on Linux (your own shared libs next to the MEX) + set_target_properties(${tgt} PROPERTIES INSTALL_RPATH "\$ORIGIN" INSTALL_RPATH_USE_LINK_PATH TRUE) + endif() + endif() +endforeach() + +# ------------------------------------------------------------ +# Install & CPack (optional) +if(FINUFFT_MATLAB_INSTALL) + if(CMAKE_BUILD_TYPE STREQUAL "Release") + set(CPACK_STRIP_FILES TRUE) + endif() + + # Install MATLAB assets + if(MATLAB_MFILES) + install(FILES ${MATLAB_MFILES} DESTINATION matlab) + endif() + + # Install class folder (if present) + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/@gpuArray") + install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/@gpuArray" DESTINATION matlab) + endif() + + # Install the MEX targets into matlab/ + if(TARGET finufft_mex) + install(TARGETS finufft_mex RUNTIME DESTINATION matlab LIBRARY DESTINATION matlab ARCHIVE DESTINATION matlab) + endif() + if(TARGET cufinufft_mex) + install(TARGETS cufinufft_mex RUNTIME DESTINATION matlab LIBRARY DESTINATION matlab ARCHIVE DESTINATION matlab) + endif() + + # CPack: produce a clean archive with ONLY top-level "matlab/" + set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF) + if(WIN32) + set(CPACK_GENERATOR "ZIP") + set(CPACK_PACKAGE_FILE_NAME "finufft-matlab-mex-windows") + elseif(APPLE) + set(CPACK_GENERATOR "TGZ") + set(CPACK_PACKAGE_FILE_NAME "finufft-matlab-mex-macos-${CMAKE_OSX_DEPLOYMENT_TARGET}") + else() + set(CPACK_GENERATOR "TGZ") + set(CPACK_PACKAGE_FILE_NAME "finufft-matlab-mex-linux") + endif() + + include(CPack) +endif() diff --git a/matlab/finufft.mw b/matlab/finufft.mw index 4f82493c6..270695e0e 100644 --- a/matlab/finufft.mw +++ b/matlab/finufft.mw @@ -79,6 +79,9 @@ $ } $ else if (strcmp(fname[ifield],"spread_kerpad") == 0) { $ oc->spread_kerpad = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); $ } +$ else if (strcmp(fname[ifield],"spread_simd") == 0) { +$ oc->spread_simd = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); +$ } $ else if (strcmp(fname[ifield],"fftw") == 0) { $ oc->fftw = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); $ } diff --git a/matlab/opts.docbit b/matlab/opts.docbit index 1dbc841a5..8055d5ee4 100644 --- a/matlab/opts.docbit +++ b/matlab/opts.docbit @@ -4,6 +4,7 @@ % opts.spread_sort: 0 (don't sort NU pts), 1 (do), 2 (auto, default) % opts.spread_kerevalmeth: 0: exp(sqrt()), 1: Horner ppval (faster) % opts.spread_kerpad: (iff kerevalmeth=0) 0: don't pad to mult of 4, 1: do +% opts.spread_simd: 0 auto(=2), 1 scalar, 2 manual vectorization % opts.fftw: FFTW plan mode, 64=FFTW_ESTIMATE (default), 0=FFTW_MEASURE, etc % opts.upsampfac: sigma. 2.0 (default), or 1.25 (low RAM, smaller FFT) % opts.spread_thread: for ntrans>1 only. 
0:auto, 1:seq multi, 2:par, etc diff --git a/python/finufft/finufft/_finufft.py b/python/finufft/finufft/_finufft.py index 593fc5b16..a581b4a70 100644 --- a/python/finufft/finufft/_finufft.py +++ b/python/finufft/finufft/_finufft.py @@ -12,6 +12,7 @@ from ctypes import c_float from ctypes import c_int from ctypes import c_longlong +from ctypes import c_size_t from ctypes import c_void_p import numpy as np @@ -81,7 +82,9 @@ class FinufftOpts(ctypes.Structure): ('spread_sort', c_int), ('spread_kerevalmeth', c_int), ('spread_kerpad', c_int), + ('spread_simd', c_int), ('upsampfac', c_double), + ('hint_nj', c_size_t), ('spread_thread', c_int), ('maxbatchsize', c_int), ('spread_nthr_atomic', c_int), diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 000000000..3ce6f700f --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,106 @@ +include(staticAnalysis) +include(setupIWYU) + +if((APPLE) AND (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")) + add_link_options("-ld_classic") +endif() + +set(FINUFFT_FFTW_LIBRARIES) +include(setupXSIMD) +if(FINUFFT_USE_DUCC0) + include(setupDUCC) +else() + include(setupFFTW) +endif() +if(FINUFFT_USE_DUCC0) + set(FINUFFT_FFTLIBS ducc0) +else() + set(FINUFFT_FFTLIBS ${FINUFFT_FFTW_LIBRARIES}) +endif() +if(FINUFFT_USE_OPENMP) + find_package(OpenMP COMPONENTS C CXX REQUIRED) +endif() + +set(FINUFFT_SOURCES + spreadinterp.cpp + fft.cpp + finufft_core.cpp + c_interface.cpp + finufft_utils.cpp + utils.cpp +) + +if(FINUFFT_BUILD_FORTRAN) + list(APPEND FINUFFT_SOURCES ${PROJECT_SOURCE_DIR}/fortran/finufftfort.cpp) +endif() + +if(FINUFFT_STATIC_LINKING) + add_library(finufft STATIC ${FINUFFT_SOURCES}) +else() + add_library(finufft SHARED ${FINUFFT_SOURCES}) +endif() + +enable_static_analysis(finufft) +add_iwyu_fix_target(finufft) +enable_lto(finufft FULL) + +add_library(finufft::finufft ALIAS finufft) + +if(FINUFFT_USE_DUCC0) + target_compile_definitions(finufft PRIVATE FINUFFT_USE_DUCC0) +endif() + +target_link_libraries(finufft PUBLIC $) + +target_link_libraries(finufft PRIVATE $) +if(FINUFFT_USE_OPENMP) + target_link_libraries(finufft PRIVATE OpenMP::OpenMP_CXX) + if(NOT FINUFFT_STATIC_LINKING) + target_link_libraries(finufft INTERFACE OpenMP::OpenMP_CXX) + endif() + target_link_options(finufft PRIVATE ${OpenMP_CXX_FLAGS}) + target_include_directories(finufft PRIVATE $) +endif() + +if(FINUFFT_SHARED_LINKING) + target_compile_definitions(finufft PRIVATE FINUFFT_DLL) + if(WIN32) + target_compile_definitions(finufft PRIVATE dll_EXPORTS) + endif() +endif() + +set_target_properties(finufft PROPERTIES CXX_VISIBILITY_PRESET hidden VISIBILITY_INLINES_HIDDEN YES) + +find_library(MATH_LIBRARY m) +if(MATH_LIBRARY) + target_link_libraries(finufft PRIVATE ${MATH_LIBRARY}) +endif() + +target_include_directories(finufft PUBLIC $ $) + +target_compile_features(finufft PRIVATE cxx_std_17) +target_compile_options(finufft PRIVATE ${FINUFFT_ARCH_FLAGS}) +target_compile_options(finufft PRIVATE $<$:${FINUFFT_CXX_FLAGS_RELEASE}>) +target_compile_options(finufft PRIVATE $<$:${FINUFFT_CXX_FLAGS_RELWITHDEBINFO}>) +target_compile_options(finufft PRIVATE $<$:${FINUFFT_CXX_FLAGS_DEBUG}>) +target_include_directories(finufft PUBLIC $) +target_include_directories(finufft SYSTEM INTERFACE $) +set_target_properties( + finufft + PROPERTIES + POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE} + INTERPROCEDURAL_OPTIMIZATION ${FINUFFT_INTERPROCEDURAL_OPTIMIZATION} +) +enable_asan(finufft) + +if(FINUFFT_ENABLE_INSTALL) + file(GLOB FINUFFT_PUBLIC_HEADERS 
"${PROJECT_SOURCE_DIR}/include/finufft*.h") + set_target_properties(finufft PROPERTIES PUBLIC_HEADER "${FINUFFT_PUBLIC_HEADERS}") +endif() + +set(FINUFFT_FFTW_LIBRARIES "${FINUFFT_FFTW_LIBRARIES}" PARENT_SCOPE) +set(FINUFFT_FFTLIBS "${FINUFFT_FFTLIBS}" PARENT_SCOPE) + +set(_targets ${INSTALL_TARGETS}) +list(APPEND _targets finufft) +set(INSTALL_TARGETS "${_targets}" PARENT_SCOPE) diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index 2df2aab54..28873d0af 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -1,4 +1,6 @@ -set(PRECISION_INDEPENDENT_SRC precision_independent.cu utils.cpp) +include(staticAnalysis) +include(setupIWYU) +set(PRECISION_INDEPENDENT_SRC precision_independent.cu ../utils.cpp) set(PRECISION_DEPENDENT_SRC spreadinterp.cpp @@ -18,39 +20,50 @@ set(PRECISION_DEPENDENT_SRC ) set(CUFINUFFT_INCLUDE_DIRS - ${PROJECT_SOURCE_DIR}/include - ${PROJECT_SOURCE_DIR}/contrib + $ + $ + $ $ $ ) set(CUFINUFFT_INCLUDE_DIRS ${CUFINUFFT_INCLUDE_DIRS} PARENT_SCOPE) -# flush denormals to zero and enable verbose PTXAS output -if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") - list(APPEND FINUFFT_CUDA_FLAGS $<$:--extended-lambda --extra-device-vectorization>) -endif() +list(APPEND FINUFFT_CUDA_FLAGS $<$:--extended-lambda --extra-device-vectorization>) if(FINUFFT_SHARED_LINKING) add_library(cufinufft SHARED ${PRECISION_INDEPENDENT_SRC} ${PRECISION_DEPENDENT_SRC}) else() add_library(cufinufft STATIC ${PRECISION_INDEPENDENT_SRC} ${PRECISION_DEPENDENT_SRC}) endif() + +enable_static_analysis(cufinufft) +add_iwyu_fix_target(cufinufft) +enable_lto(cufinufft FULL) +add_library(finufft::cufinufft ALIAS cufinufft) target_include_directories(cufinufft PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) # set target build location set_target_properties(cufinufft PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}") +set(FINUFFT_CUDA_VISIBILITY_PRESET hidden) +if(FINUFFT_BUILD_TESTS) + set(FINUFFT_CUDA_VISIBILITY_PRESET default) +endif() + set_target_properties( cufinufft PROPERTIES CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}" CUDA_SEPARABLE_COMPILATION ON - WINDOWS_EXPORT_ALL_SYMBOLS ON ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" - INTERPROCEDURAL_OPTIMIZATION - OFF # LTO is not supported for CUDA for now + INTERPROCEDURAL_OPTIMIZATION ${FINUFFT_INTERPROCEDURAL_OPTIMIZATION} POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE} LINKER_LANGUAGE CUDA + CXX_VISIBILITY_PRESET ${FINUFFT_CUDA_VISIBILITY_PRESET} + CUDA_VISIBILITY_PRESET ${FINUFFT_CUDA_VISIBILITY_PRESET} + VISIBILITY_INLINES_HIDDEN YES + CUDA_RUNTIME_LIBRARY Shared + WINDOWS_EXPORT_ALL_SYMBOLS NO ) if(DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true" AND MSVC) @@ -61,12 +74,25 @@ else() endif() target_compile_options(cufinufft PUBLIC ${FINUFFT_CUDA_FLAGS}) -if(WIN32 OR (BUILD_TESTING AND FINUFFT_BUILD_TESTS) OR CIBUILDWHEEL) +if(FINUFFT_SHARED_LINKING) + target_compile_definitions(cufinufft PRIVATE FINUFFT_DLL) + if(WIN32) + target_compile_definitions(cufinufft PRIVATE dll_EXPORTS) + endif() +endif() + +target_link_libraries(cufinufft PRIVATE CUDA::cudart CUDA::cufft) +# Expose only when not doing fully static linking +if(NOT FINUFFT_STATIC_LINKING) target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft) -else() - target_link_libraries(cufinufft PUBLIC CUDA::cudart_static CUDA::cufft_static) endif() -target_link_libraries(cufinufft PUBLIC CCCL::CCCL) + +target_link_libraries(cufinufft PRIVATE CCCL::CCCL) +if(NOT FINUFFT_STATIC_LINKING) + target_link_libraries(cufinufft INTERFACE 
CCCL::CCCL) +endif() + +check_cxx_compiler_flag(-Wno-deprecated-declarations FINUFFT_HAS_NO_DEPRECATED_DECLARATIONS) # disable deprecated warnings for tests if supported if(FINUFFT_HAS_NO_DEPRECATED_DECLARATIONS) target_compile_options(cufinufft PRIVATE $<$:-Wno-deprecated-declarations>) diff --git a/src/cuda/common.cu b/src/cuda/common.cu index 2b43aadd0..164df2caa 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -1,8 +1,6 @@ #include #include #include -#include -#include #include #include @@ -10,13 +8,13 @@ #include #include #include -#include #include #include namespace cufinufft { namespace common { using namespace cufinufft::spreadinterp; +using namespace finufft::common; using std::max; /** Kernel for computing approximations of exact Fourier series coeffs of @@ -201,11 +199,11 @@ void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *phase, const auto q = (int)(2 + 3.0 * J2); // matches CPU code double z[2 * MAX_NQUAD]; double w[2 * MAX_NQUAD]; - cufinufft::utils::gaussquad(2 * q, z, w); // only half the nodes used, for (0,1) - for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n - z[n] *= J2; // rescale nodes + gaussquad(2 * q, z, w); // only half the nodes used, for (0,1) + for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n + z[n] *= J2; // rescale nodes f[n] = J2 * w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei - phase[n] = T(2.0 * M_PI * z[n] / T(nf)); // phase winding rates + phase[n] = T(2.0 * PI * z[n] / T(nf)); // phase winding rates } } @@ -217,7 +215,7 @@ void onedim_nuft_kernel_precomp(T *f, T *z, finufft_spread_opts opts) { int q = (int)(2 + 2.0 * J2); // matches CPU code double z_local[2 * MAX_NQUAD]; double w_local[2 * MAX_NQUAD]; - cufinufft::utils::gaussquad(2 * q, z_local, w_local); // half the nodes, (0,1) + gaussquad(2 * q, z_local, w_local); // half the nodes, (0,1) for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n z[n] = J2 * T(z_local[n]); // rescale nodes f[n] = J2 * w_local[n] * evaluate_kernel(z[n], opts); // vals & quadr wei diff --git a/src/cuda/spreadinterp.cpp b/src/cuda/spreadinterp.cpp index 832debddd..96210d4ec 100644 --- a/src/cuda/spreadinterp.cpp +++ b/src/cuda/spreadinterp.cpp @@ -7,12 +7,14 @@ #include +using namespace finufft::common; + namespace cufinufft { namespace spreadinterp { template -int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmeth, - int debug, int spreadinterponly) +int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, + int kerevalmeth, int debug, int spreadinterponly) // Initializes spreader kernel parameters given desired NUFFT tolerance eps, // upsampling factor (=sigma in paper, or R in Dutt-Rokhlin), and ker eval meth // (etiher 0:exp(sqrt()), 1: Horner ppval). @@ -55,9 +57,10 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet // Set kernel width w (aka ns) and ES kernel beta parameter, in opts... 
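// Illustrative numbers (not from this code): at eps = 1e-6 the default sigma = 2.0 rule
// below gives ns = ceil(-log10(1e-6/10)) = 7, while the custom-sigma formula at
// sigma = 1.25 gives ns = ceil(-ln(1e-6)/(pi*sqrt(1 - 1/1.25))) = ceil(9.84) = 10;
// either way ns is then clamped to the range [2, MAX_NSPREAD].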
int ns = std::ceil(-log10(eps / (T)10.0)); // 1 digit per power of ten if (upsampfac != 2.0) // override ns for custom sigma - ns = std::ceil(-log(eps) / (T(M_PI) * sqrt(1 - 1 / upsampfac))); // formula, - // gamma=1 - ns = std::max(2, ns); // we don't have ns=1 version yet + ns = std::ceil( + -log(eps) / (T(PI) * sqrt(1 - 1 / upsampfac))); // formula, + // gamma=1 + ns = std::max(2, ns); // we don't have ns=1 version yet if (ns > MAX_NSPREAD) { // clip to match allocated arrays fprintf(stderr, "[%s] warning: at upsampfac=%.3g, tol=%.3g would need kernel width ns=%d; " @@ -76,8 +79,9 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet if (ns == 4) betaoverns = 2.38; if (upsampfac != 2.0) { // again, override beta for custom sigma T gamma = 0.97; // must match devel/gen_all_horner_C_code.m - betaoverns = gamma * T(M_PI) * (1 - 1 / (2 * upsampfac)); // formula based on - // cutoff + betaoverns = gamma * T(PI) * (1 - 1 / (2 * upsampfac)); // formula + // based on + // cutoff } opts.ES_beta = betaoverns * (T)ns; // set the kernel beta parameter if (debug) diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp index e7e9d2cb3..09cd925f4 100644 --- a/src/finufft_core.cpp +++ b/src/finufft_core.cpp @@ -1,21 +1,22 @@ -#include -#include -#include -#include -#include - +#include #include #include #include #include #include -#include + +#include +#include +#include +#include +#include +#include using namespace finufft; using namespace finufft::utils; using namespace finufft::spreadinterp; using namespace finufft::heuristics; - +using namespace finufft::common; /* Computational core for FINUFFT. Based on Barnett 2017-2018 finufft?d.cpp containing nine drivers, plus @@ -109,6 +110,7 @@ static int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, spopts.debug = opts.spread_debug; spopts.sort = opts.spread_sort; // could make dim or CPU choices here? spopts.kerpad = opts.spread_kerpad; // (only applies to kerevalmeth=0) + spopts.simd = (opts.spread_simd == 0) ? 2 : opts.spread_simd; spopts.nthreads = opts.nthreads; // 0 passed in becomes omp max by here if (opts.spread_nthr_atomic >= 0) // overrides spopts.atomic_threshold = opts.spread_nthr_atomic; @@ -247,7 +249,8 @@ template class KernelFseries { int q = (int)(2 + 2.0 * J2); // > pi/2 ratio. 
cannot exceed MAX_NQUAD if (opts.debug) printf("q (# ker FT quadr pts) = %d\n", q); std::vector Z(2 * q), W(2 * q); - gaussquad(2 * q, Z.data(), W.data()); // only half the nodes used, for (0,1) + gaussquad(2 * q, Z.data(), W.data()); // only half the nodes used, + // for (0,1) z.resize(q); f.resize(q); for (int n = 0; n < q; ++n) { @@ -537,7 +540,9 @@ void finufft_default_opts_t(finufft_opts *o) o->spread_sort = 2; o->spread_kerevalmeth = 1; o->spread_kerpad = 1; + o->spread_simd = 0; o->upsampfac = 0.0; + o->hint_nj = 0; o->spread_thread = 0; o->maxbatchsize = 0; o->spread_nthr_atomic = -1; @@ -549,6 +554,80 @@ void finufft_default_opts_t(finufft_opts *o) } // PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP +template int FINUFFT_PLAN_T::init_spreader_and_fft() { + int ier = setup_spreader_for_nufft(spopts, tol, opts, dim); + if (ier > 1) return ier; + + if (type == 1 || type == 2) { + int nthr_fft = opts.nthreads; + spopts.spread_direction = type; + constexpr TF EPSILON = std::numeric_limits::epsilon(); + + if (opts.spreadinterponly) { + for (int idim = 0; idim < dim; ++idim) nfdim[idim] = mstu[idim]; + if (opts.debug) { + printf("[%s] %dd spreadinterponly(dir=%d): (ms,mt,mu)=(%lld,%lld,%lld)\n " + " ntrans=%d nthr=%d batchSize=%d kernel width ns=%d", + __func__, dim, type, (long long)mstu[0], (long long)mstu[1], + (long long)mstu[2], ntrans, opts.nthreads, batchSize, spopts.nspread); + if (batchSize == 1) + printf("\n"); + else + printf(" spread_thread=%d\n", opts.spread_thread); + } + } else { + if (opts.showwarn) { + for (int idim = 0; idim < dim; ++idim) + if (EPSILON * mstu[idim] > 1.0) + fprintf(stderr, + "%s warning: rounding err (due to cond # of prob) eps_mach*N%d = " + "%.3g > 1 !\n", + __func__, idim, (double)(EPSILON * mstu[idim])); + } + for (int idim = 0; idim < dim; ++idim) { + int nfier = set_nf_type12(mstu[idim], opts, spopts, &nfdim[idim]); + if (nfier) return nfier; + phiHat[idim].resize(nfdim[idim] / 2 + 1); + } + if (opts.debug) { + printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) (nf1,nf2,nf3)=(%lld,%lld,%lld)\n " + " ntrans=%d nthr=%d batchSize=%d ", + __func__, dim, type, (long long)mstu[0], (long long)mstu[1], + (long long)mstu[2], (long long)nfdim[0], (long long)nfdim[1], + (long long)nfdim[2], ntrans, opts.nthreads, batchSize); + if (batchSize == 1) + printf("\n"); + else + printf(" spread_thread=%d\n", opts.spread_thread); + } + CNTime timer; + timer.start(); + for (int idim = 0; idim < dim; ++idim) + onedim_fseries_kernel(nfdim[idim], phiHat[idim], spopts); + if (opts.debug) + printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, spopts.nspread, + timer.elapsedsec()); + if (nf() * batchSize > MAX_NF) { + fprintf( + stderr, + "%s fwBatch would be bigger than MAX_NF, not attempting memory allocation!\n", + __func__); + return FINUFFT_ERR_MAXNALLOC; + } + timer.restart(); + const auto ns = gridsize_for_fft(*this); + std::vector> fwBatch(nf() * batchSize); + fftPlan->plan(ns, batchSize, fwBatch.data(), fftSign, opts.fftw, nthr_fft); + if (opts.debug) + printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, opts.fftw, + nthr_fft, timer.elapsedsec()); + } + } else { + if (opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans); + } + return ier; +} + template FINUFFT_PLAN_T::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, int iflag, int ntrans_, TF tol_, finufft_opts *opts_, int &ier) @@ -657,111 +736,28 @@ FINUFFT_PLAN_T::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, i } } - // 
heuristic to choose default upsampfac... (currently two poss) - if (opts.upsampfac == 0.0) { // init to auto choice - // Let assume density=1 as the average use case. - // TODO: make a decision on how to choose density properly. - const auto density = TF{1}; - opts.upsampfac = bestUpsamplingFactor(opts.nthreads, density, dim, type, tol); - if (opts.debug > 1) - printf("[%s] threads %d, density %.3g, dim %d, nufft type %d, tol %.3g: auto " - "upsampfac=%.2f\n", - __func__, opts.nthreads, density, dim, type, tol, opts.upsampfac); - } - // use opts to choose and write into plan's spread options... - ier = setup_spreader_for_nufft(spopts, tol, opts, dim); - if (ier > 1) // proceed if success or warning - throw int(ier); - - // ------------------------ types 1,2: planning needed --------------------- - if (type == 1 || type == 2) { - - int nthr_fft = nthr; // give FFT all threads (or use o.spread_thread?) - // Note: batchSize not used since might be only 1. - - spopts.spread_direction = type; - constexpr TF EPSILON = std::numeric_limits::epsilon(); - - if (opts.spreadinterponly) { // (unusual case of no NUFFT, just report) + upsamp_locked = (opts.upsampfac != 0.0); - // spreadinterp grid will simply be the user's "mode" grid... - for (int idim = 0; idim < dim; ++idim) nfdim[idim] = mstu[idim]; - - if (opts.debug) { // "long long" here is to avoid warnings with printf... - printf("[%s] %dd spreadinterponly(dir=%d): (ms,mt,mu)=(%lld,%lld,%lld)" - "\n ntrans=%d nthr=%d batchSize=%d kernel width ns=%d", - __func__, dim, type, (long long)mstu[0], (long long)mstu[1], - (long long)mstu[2], ntrans, nthr, batchSize, spopts.nspread); - if (batchSize == 1) // spread_thread has no effect in this case - printf("\n"); - else - printf(" spread_thread=%d\n", opts.spread_thread); - } - - } else { // ..... usual NUFFT: eval Fourier series, alloc workspace ..... - - if (opts.showwarn) { // user warn round-off error (due to prob condition #)... - for (int idim = 0; idim < dim; ++idim) - if (EPSILON * mstu[idim] > 1.0) - fprintf( - stderr, - "%s warning: rounding err (due to cond # of prob) eps_mach*N%d = %.3g " - "> 1 !\n", - __func__, idim, (double)(EPSILON * mstu[idim])); - } - - // determine fine grid sizes, sanity check, then alloc... - for (int idim = 0; idim < dim; ++idim) { - int nfier = set_nf_type12(mstu[idim], opts, spopts, &nfdim[idim]); - if (nfier) throw nfier; // nf too big; we're done - phiHat[idim].resize(nfdim[idim] / 2 + 1); // alloc fseries - } - - if (opts.debug) { // "long long" here is to avoid warnings with printf... 
- printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) " - "(nf1,nf2,nf3)=(%lld,%lld,%lld)\n ntrans=%d nthr=%d " - "batchSize=%d ", - __func__, dim, type, (long long)mstu[0], (long long)mstu[1], - (long long)mstu[2], (long long)nfdim[0], (long long)nfdim[1], - (long long)nfdim[2], ntrans, nthr, batchSize); - if (batchSize == 1) // spread_thread has no effect in this case - printf("\n"); - else - printf(" spread_thread=%d\n", opts.spread_thread); - } + if (tol < std::numeric_limits::epsilon()) ier = FINUFFT_WARN_EPS_TOO_SMALL; - // STEP 0: get Fourier coeffs of spreading kernel along each fine grid dim - timer.restart(); - for (int idim = 0; idim < dim; ++idim) - onedim_fseries_kernel(nfdim[idim], phiHat[idim], spopts); - if (opts.debug) - printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, spopts.nspread, - timer.elapsedsec()); - - if (nf() * batchSize > MAX_NF) { - fprintf(stderr, - "[%s] fwBatch would be bigger than MAX_NF, not attempting memory " - "allocation!\n", - __func__); - throw int(FINUFFT_ERR_MAXNALLOC); - } - - timer.restart(); // plan the FFTW (to act in-place on the workspace fwBatch) - const auto ns = gridsize_for_fft(*this); - std::vector> fwBatch(nf() * batchSize); - fftPlan->plan(ns, batchSize, fwBatch.data(), fftSign, opts.fftw, nthr_fft); - if (opts.debug) - printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, opts.fftw, - nthr_fft, timer.elapsedsec()); - } - - } else { // -------------------------- type 3 (no planning) ------------ - - if (opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans); - // Type 3 will call finufft_makeplan for type 2; no need to init FFTW - // Note we don't even know nj or nk yet, so can't do anything else! + if (upsamp_locked) { + int ier2 = init_spreader_and_fft(); + if (ier2 > 1) throw int(ier2); + ier = std::max(ier, ier2); + } else if (type != 3 && opts.hint_nj > 0) { + double density = double(opts.hint_nj) / double(N()); + opts.upsampfac = bestUpsamplingFactor(opts.nthreads, density, dim, type, tol); + if (opts.debug > 1) + printf("[%s] hint_nj=%zu -> auto upsampfac=%.2f\n", __func__, opts.hint_nj, + opts.upsampfac); + int ier2 = init_spreader_and_fft(); + if (ier2 > 1) throw int(ier2); + ier = std::max(ier, ier2); + } else { + if (opts.debug > 1) + printf("[%s] deferring upsampfac choice until setpts\n", __func__); } - if (ier > 1) throw int(ier); // report setup_spreader status (could be warning) + if (ier > 1) throw int(ier); } template @@ -800,8 +796,9 @@ int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF TF *u) { // Method function to set NU points and do precomputations. Barnett 2020. // See ../docs/cguru.doc for current documentation. - int d = dim; // abbrev for spatial dim - CNTime timer; + int d = dim; // abbrev for spatial dim + int ier = 0; + CNTime timer{}; timer.start(); this->nj = nj; // the user only now chooses how many NU (x,y,z) pts if (nj < 0) { @@ -812,12 +809,37 @@ int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF return FINUFFT_ERR_NUM_NU_PTS_INVALID; } - if (type != 3) { // ------------------ TYPE 1,2 SETPTS ------------------- - // (all we can do is check and maybe bin-sort the NU pts) - XYZ = {xj, yj, zj}; // plan must keep pointers to user's fixed NU pts - int ier = spreadcheck(nfdim[0], nfdim[1], nfdim[2], spopts); - if (ier) // no warnings allowed here - return ier; + // Helper to merge statuses: keep the worst (max), return fatal (>1) immediately. 
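// (Illustrative usage, not part of this function: a guru-interface caller that already
//  knows roughly how many NU points it will pass can provide that estimate at plan time,
//  so a first density-based upsampfac choice is made there rather than deferred to here:
//      finufft_opts o; finufft_default_opts(&o);
//      o.upsampfac = 0.0;      // keep the automatic selection
//      o.hint_nj   = M;        // M = caller's estimate of nj (hypothetical name)
//      finufft_makeplan(1, dim, Nmodes, +1, ntrans, tol, &plan, &o);
//  setpts still recomputes the density from the actual nj below and re-initializes the
//  spreader/FFT whenever the plan was not locked by an explicit nonzero upsampfac.)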
+ auto accumulate_status = [&](int code) -> int { + if (code > 1) return code; // fatal + ier = std::max(ier, code); // 0 or 1: keep the worst so far + return 0; // 0 indicates "no fatal error" + }; + + // Lambda to auto-select upsampfac (if not locked) and init spreader/FFT + auto update_upsampfac_and_init = [&]() -> int { + if (upsamp_locked) return 0; + double density = double(nj) / double(N()); + double new_usf = bestUpsamplingFactor(opts.nthreads, density, dim, type, tol); + if (opts.upsampfac != new_usf) { + const char *updated = opts.upsampfac == 0 ? "" : "updated"; + opts.upsampfac = new_usf; + if (opts.debug > 1) + printf("[setpts] %s upsampfac=%.2f (density=%.3f)\n", updated, opts.upsampfac, + density); + } + return init_spreader_and_fft(); + }; + + if (type != 3) { // ------------------ TYPE 1,2 SETPTS ------------------- + // (all we can do is check and maybe bin-sort the NU pts) + if (!upsamp_locked) { + if (int fatal = accumulate_status(update_upsampfac_and_init())) return fatal; + } + XYZ = {xj, yj, zj}; // plan must keep pointers to user's fixed NU pts + if (const auto err = spreadcheck(nfdim[0], nfdim[1], nfdim[2], spopts)) { + return err; // no warnings allowed here + } timer.restart(); sortIndices.resize(nj); didSort = @@ -828,7 +850,6 @@ int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF } else { // ------------------------- TYPE 3 SETPTS ----------------------- // (here we can precompute pre/post-phase factors and plan the t2) - std::array XYZ_in{xj, yj, zj}; std::array STU_in{s, t, u}; if (nk < 0) { @@ -841,21 +862,34 @@ int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF this->nk = nk; // user set # targ freq pts STU = {s, t, u}; - // pick x, s intervals & shifts & # fine grid pts (nf) in each dim... + // pick x, s intervals & shifts in each dim... std::array S = {0, 0, 0}; if (opts.debug) printf("\tM=%lld N=%lld\n", (long long)nj, (long long)nk); for (int idim = 0; idim < dim; ++idim) { arraywidcen(nj, XYZ_in[idim], &(t3P.X[idim]), &(t3P.C[idim])); - arraywidcen(nk, STU_in[idim], &S[idim], &(t3P.D[idim])); // same D, S, but for {s_k} + arraywidcen(nk, STU_in[idim], &S[idim], &(t3P.D[idim])); + // set it here once to compute the density later + // set_nhg_type3 will overwrite it + mstu[idim] = std::max(1, (BIGINT)std::ceil(2 * S[idim] * t3P.X[idim] / PI)); + } + for (int idim = dim; idim < 3; ++idim) { + t3P.C[idim] = t3P.D[idim] = 0.0; + mstu[idim] = 1; + } + + if (!upsamp_locked) { + if (int fatal = accumulate_status(update_upsampfac_and_init())) return fatal; + } + + for (int idim = 0; idim < dim; ++idim) { set_nhg_type3(S[idim], t3P.X[idim], opts, spopts, &(nfdim[idim]), &(t3P.h[idim]), - &(t3P.gam[idim])); // applies twist i) - if (opts.debug) // report on choices of shifts, centers, etc... + &(t3P.gam[idim])); + mstu[idim] = nfdim[idim]; + if (opts.debug) printf("\tX%d=%.3g C%d=%.3g S%d=%.3g D%d=%.3g gam%d=%g nf%d=%lld h%d=%.3g\t\n", idim, t3P.X[idim], idim, t3P.C[idim], idim, S[idim], idim, t3P.D[idim], idim, t3P.gam[idim], idim, (long long)nfdim[idim], idim, t3P.h[idim]); } - for (int idim = dim; idim < 3; ++idim) - t3P.C[idim] = t3P.D[idim] = 0.0; // their defaults if dim 2 unused, etc if (nf() * batchSize > MAX_NF) { fprintf(stderr, @@ -941,30 +975,36 @@ int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF t2opts.modeord = 0; // needed for correct t3! 
t2opts.debug = std::max(0, opts.debug - 1); // don't print as much detail t2opts.spread_debug = std::max(0, opts.spread_debug - 1); - t2opts.showwarn = 0; // so don't see warnings 2x + t2opts.showwarn = 0; + if (!upsamp_locked) t2opts.upsampfac = 0; // I think the inner t2 can decide what is best here? + // so don't see warnings 2x // (...could vary other t2opts here?) // MR: temporary hack, until we have figured out the C++ interface. FINUFFT_PLAN_T *tmpplan; - int ier = finufft_makeplan_t(2, d, t2nmodes, fftSign, batchSize, tol, &tmpplan, - &t2opts); - if (ier > 1) { // if merely warning, still proceed + int status = finufft_makeplan_t(2, d, t2nmodes, fftSign, batchSize, tol, &tmpplan, + &t2opts); + if (status > 1) { // if merely warning, still proceed fprintf(stderr, "[%s t3]: inner type 2 plan creation failed with ier=%d!\n", - __func__, ier); - return ier; + __func__, status); + return status; } - ier = tmpplan->setpts(nk, STUp[0].data(), STUp[1].data(), STUp[2].data(), 0, nullptr, - nullptr, - nullptr); // note nk = # output points (not nj) + if (int fatal = accumulate_status(status)) return fatal; + + status = tmpplan->setpts(nk, STUp[0].data(), STUp[1].data(), STUp[2].data(), 0, + nullptr, nullptr, nullptr); // note nk = # output points innerT2plan.reset(tmpplan); - if (ier > 1) { - fprintf(stderr, "[%s t3]: inner type 2 setpts failed, ier=%d!\n", __func__, ier); - return ier; + if (status > 1) { + fprintf(stderr, "[%s t3]: inner type 2 setpts failed, ier=%d!\n", __func__, status); + return status; } + if (int fatal = accumulate_status(status)) return fatal; + if (opts.debug) printf("[%s t3] inner t2 plan & setpts: \t%.3g s\n", __func__, timer.elapsedsec()); } - return 0; + return ier; } + template int FINUFFT_PLAN_T::setpts(BIGINT nj, float *xj, float *yj, float *zj, BIGINT nk, float *s, float *t, float *u); template int FINUFFT_PLAN_T::setpts(BIGINT nj, double *xj, double *yj, double *zj, diff --git a/src/finufft_utils.cpp b/src/finufft_utils.cpp index b23e81304..6783339c7 100644 --- a/src/finufft_utils.cpp +++ b/src/finufft_utils.cpp @@ -5,9 +5,9 @@ #include -#include #include #include +#include #if defined(_WIN32) #include @@ -45,69 +45,6 @@ BIGINT next235even(BIGINT n) return nplus; } -void gaussquad(int n, double *xgl, double *wgl) { - // n-node Gauss-Legendre quadrature, adapted from a code by Jason Kaye (2022-2023), - // from the utils file of https://github.com/flatironinstitute/cppdlr version 1.2, - // which is Apache-2 licensed. It uses Newton iteration from Chebyshev points. - // Double-precision only. - // Adapted by Barnett 6/8/25 to write nodes (xgl) and weights (wgl) into arrays - // that the user must pre-allocate to length at least n. - - double x = 0, dx = 0; - int convcount = 0; - - // Get Gauss-Legendre nodes - xgl[n / 2] = 0; // If odd number of nodes, middle node is 0 - for (int i = 0; i < n / 2; i++) { // Loop through nodes - convcount = 0; - x = cos((2 * i + 1) * PI / (2 * n)); // Initial guess: Chebyshev node - while (true) { // Newton iteration - auto [p, dp] = leg_eval(n, x); - dx = -p / dp; - x += dx; // Newton step - if (std::abs(dx) < 1e-14) { - convcount++; - } - if (convcount == 3) { - break; - } // If convergence tol hit 3 times, stop - } - xgl[i] = -x; - xgl[n - i - 1] = x; // Symmetric nodes - } - - // Get Gauss-Legendre weights from formula - // w_i = -2 / ((n+1)*P_n'(x_i)*P_{n+1}(x_i)) (Atkinson '89, pg. 
276) - for (int i = 0; i < n / 2 + 1; i++) { - auto [junk1, dp] = leg_eval(n, xgl[i]); - auto [p, junk2] = leg_eval(n + 1, xgl[i]); // This is a bit inefficient, but who - // cares... - wgl[i] = -2 / ((n + 1) * dp * p); - wgl[n - i - 1] = wgl[i]; - } -} - -std::tuple leg_eval(int n, double x) { - // return Legendre polynomial P_n(x) and its derivative P'_n(x). - // Uses Legendre three-term recurrence. - // Used by gaussquad above, with which it shares the same authorship and source. - - if (n == 0) { - return {1.0, 0.0}; - } - if (n == 1) { - return {x, 1.0}; - } - // Three-term recurrence and formula for derivative - double p0 = 0.0, p1 = 1.0, p2 = x; - for (int i = 1; i < n; i++) { - p0 = p1; - p1 = p2; - p2 = ((2 * i + 1) * x * p1 - i * p0) / (i + 1); - } - return {p2, n * (x * p2 - p1) / (x * x - 1)}; -} - // ----------------------- helpers for timing (always stay double prec) ------ void CNTime::start() { diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 9f3602cd1..b8e7f3d64 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -6,8 +6,9 @@ #include "ker_horner_allw_loop_constexpr.h" #include "ker_lowupsampfac_horner_allw_loop_constexpr.h" -#include +#include +#include #include #include #include @@ -15,7 +16,8 @@ #include using namespace std; -using namespace finufft::utils; // access to timer +using namespace finufft::utils; // access to timer +using namespace finufft::common; // access to constants and dispatch namespace finufft::spreadinterp { @@ -43,17 +45,68 @@ template struct shuffle_index { return index < cap ? (cap - 1 - index) : size + size + cap - 1 - index; } }; -struct select_even { +struct [[maybe_unused]] select_even { static constexpr unsigned get(unsigned index, unsigned /*size*/) { return index * 2; } }; -struct select_odd { +struct [[maybe_unused]] select_odd { static constexpr unsigned get(unsigned index, unsigned /*size*/) { return index * 2 + 1; } }; -// this finds the largest SIMD instruction set that can handle N elements -// void otherwise -> compile error +#if defined(XSIMD_WITH_EMULATED) && XSIMD_WITH_EMULATED + +template constexpr uint8_t min_simd_width() { + return static_cast(xsimd::batch::size); +} + +template constexpr uint8_t find_optimal_simd_width() { + return static_cast(xsimd::batch::size); +} + +template constexpr uint8_t GetPaddedSIMDWidth() { + return static_cast(xsimd::batch::size); +} + +template using PaddedSIMD = xsimd::batch; + +// --- compile-time padding for ns --- +template constexpr uint8_t get_padding() { + constexpr uint8_t width = GetPaddedSIMDWidth(); + const unsigned w = width ? width : 1u; + const unsigned padded = ((static_cast(ns) + w - 1u) / w) * w; + return static_cast(padded - ns); +} + +template constexpr uint8_t get_padding_helper(uint8_t runtime_ns) { + if constexpr (ns < 2) { + return 0; + } else { + return (runtime_ns == ns) + ? 
get_padding() + : get_padding_helper(ns - 1)>(runtime_ns); + } +} + +template inline uint8_t get_padding(uint8_t ns) { + if (ns == 0) return 0; + if (ns > 2 * MAX_NSPREAD) ns = static_cast(2 * MAX_NSPREAD); + return get_padding_helper(2 * MAX_NSPREAD)>(ns); +} + +// --- "best SIMD" type selection (trivial in emulated) --- +template +static constexpr xsimd::batch BestSIMDHelper() { + return {}; +} + +template using BestSIMD = xsimd::batch; + +#else +// NATIVE BACKENDS + +// ---- your original helpers (unchanged) ---- + template static constexpr auto BestSIMDHelper() { if constexpr (N % K == 0) { // returns void in the worst case return xsimd::make_sized_batch{}; @@ -61,26 +114,28 @@ template static constexpr auto BestSIMDHelper return BestSIMDHelper> 1)>(); } } + template constexpr uint8_t min_simd_width() { // finds the smallest simd width that can handle N elements // simd size is batch size the SIMD width in xsimd terminology if constexpr (std::is_void_v>) { - return min_simd_width(); + return min_simd_width(N * 2)>(); } else { return N; } -}; +} -template constexpr auto find_optimal_simd_width() { +template constexpr uint8_t find_optimal_simd_width() { // finds the smallest simd width that minimizes the number of iterations // NOTE: might be suboptimal for some cases 2^N+1 for example // in the future we might want to implement a more sophisticated algorithm uint8_t optimal_simd_width = min_simd_width(); - uint8_t min_iterations = (N + optimal_simd_width - 1) / optimal_simd_width; + uint8_t min_iterations = + static_cast((N + optimal_simd_width - 1) / optimal_simd_width); for (uint8_t simd_width = optimal_simd_width; simd_width <= xsimd::batch::size; - simd_width *= 2) { - uint8_t iterations = (N + simd_width - 1) / simd_width; + simd_width = static_cast(simd_width * 2)) { + uint8_t iterations = static_cast((N + simd_width - 1) / simd_width); if (iterations < min_iterations) { min_iterations = iterations; optimal_simd_width = simd_width; @@ -89,48 +144,52 @@ template constexpr auto find_optimal_simd_width() { return optimal_simd_width; } -template constexpr auto GetPaddedSIMDWidth() { +template constexpr uint8_t GetPaddedSIMDWidth() { // helper function to get the SIMD width with padding for the given number of elements // that minimizes the number of iterations return xsimd::make_sized_batch()>::type::size; } + template using PaddedSIMD = typename xsimd::make_sized_batch()>::type; + template constexpr auto get_padding() { - // helper function to get the padding for the given number of elements - // ns is known at compile time, rounds ns to the next multiple of the SIMD width - // then subtracts ns to get the padding using a bitwise and trick - // WARING: this trick works only for power of 2s - // SOURCE: Agner Fog's VCL manual + // WARING: your original used a PoT bit trick; to be safe, use a generic + // ceil-to-multiple: constexpr uint8_t width = GetPaddedSIMDWidth(); - return ((ns + width - 1) & (-width)) - ns; + const unsigned w = width ? 
width : 1u; + const unsigned padded = ((static_cast(ns) + w - 1u) / w) * w; + return static_cast(padded - ns); } +// ---- guard these too (runtime API) ---- template constexpr auto get_padding_helper(uint8_t runtime_ns) { - // helper function to get the padding for the given number of elements where ns is - // known at runtime, it uses recursion to find the padding - // this allows to avoid having a function with a large number of switch cases - // as GetPaddedSIMDWidth requires a compile time value - // it cannot be a lambda function because of the template recursion if constexpr (ns < 2) { - return 0; + return static_cast(0); } else { if (runtime_ns == ns) { return get_padding(); } else { - return get_padding_helper(runtime_ns); + return get_padding_helper(ns - 1)>(runtime_ns); } } } -template uint8_t get_padding(uint8_t ns) { +template inline uint8_t get_padding(uint8_t ns) { // return the padding as a function of the number of elements // 2 * MAX_NSPREAD is the maximum number of elements that we can have // that's why is hardcoded here - return get_padding_helper(ns); +#ifndef MAX_NSPREAD +#define MAX_NSPREAD 16 +#endif + return get_padding_helper(2 * MAX_NSPREAD)>(ns); } + template using BestSIMD = typename decltype(BestSIMDHelper::size>())::type; + +#endif // XSIMD_WITH_EMULATED + template constexpr T generate_sequence_impl(V a, V b, index_sequence) noexcept { // utility function to generate a sequence of a, b interleaved as function arguments @@ -183,6 +242,65 @@ template FINUFFT_ALWAYS_INLINE auto xsimd_to_array(const T &vec) noe return array; } +// ---- Sum even and odd indices from a plain array ---- +template +FINUFFT_ALWAYS_INLINE constexpr std::array chsum_even_odd_from_array_impl( + const std::array &a, std::index_sequence) noexcept { + static_assert(N % 2 == 0, "Array size must be even for channel sum."); + const T s_even = (T{} + ... + a[I * 2]); + const T s_odd = (T{} + ... + a[I * 2 + 1]); + return std::array{s_even, s_odd}; +} + +template +FINUFFT_ALWAYS_INLINE constexpr std::array chsum_even_odd_from_array( + const std::array &a) noexcept { + return chsum_even_odd_from_array_impl(a, std::make_index_sequence{}); +} + +// ---- Array-based fallback: works for any Batch ---- +template +FINUFFT_ALWAYS_INLINE auto chsum_fallback_arrayized(const Batch &v) noexcept + -> std::array { + using T = typename Batch::value_type; + constexpr std::size_t N = Batch::size; + const auto a = xsimd_to_array(v); + return chsum_even_odd_from_array(a); +} + +// ---- Main chsum: reduce SIMD lanes into {sum_even_lanes, sum_odd_lanes} ---- +// Recursively adds low/high halves until we reach the base width given by +// min_simd_width(). If half-width SIMD type is unavailable, we fall back to the array +// method. +template +FINUFFT_ALWAYS_INLINE auto chsum(const Batch &v) noexcept + -> std::array { + using T = typename Batch::value_type; + constexpr int N = static_cast(Batch::size); + static_assert(N % 2 == 0, "SIMD batch size must be even."); + + // Base case determined by the smallest supported SIMD width for T (e.g. 
4 for float, 2 + // for double) + constexpr int BASE = static_cast(min_simd_width()); + + if constexpr (N <= BASE) { + // At or below base width: just sum from an array + return chsum_fallback_arrayized(v); + } else { + using half_t = xsimd::make_sized_batch_t(N / 2)>; + if constexpr (std::is_void::value) { + // If we can't form a half-width batch, fall back to array method + return chsum_fallback_arrayized(v); + } else { + // Portable split via stack array, then unaligned loads + auto a = xsimd_to_array(v); + const auto low = half_t::load_unaligned(a.data()); + const auto high = half_t::load_unaligned(a.data() + (N / 2)); + return chsum(low + high); + } + } +} + FINUFFT_NEVER_INLINE void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3, UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, UBIGINT size3, @@ -276,47 +394,48 @@ static void evaluate_kernel_vector(T *ker, T *args, if (abs(args[i]) >= (T)opts.ES_halfwidth) ker[i] = 0.0; } -template()>> // aka ns -static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner( - T *FINUFFT_RESTRICT ker, T x, - const finufft_spread_opts &opts [[maybe_unused]]) noexcept -/* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at -x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns. -This is the current evaluation method, since it's faster (except i7 w=16). -Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ + xsimd::make_sized_batch_t()>> // ns = w +static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner(T *FINUFFT_RESTRICT ker, T x, + const finufft_spread_opts &opts + [[maybe_unused]]) noexcept { + // scale so local grid offset z in [-1,1] + const T z = std::fma(T(2.0), x, T(w - 1)); + using arch_t = typename simd_type::arch_type; + static constexpr auto alignment = arch_t::alignment(); + static constexpr auto simd_size = simd_type::size; + static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1); -{ - // scale so local grid offset z in[-1,1] - const T z = std::fma(T(2.0), x, T(w - 1)); - using arch_t = typename simd_type::arch_type; - static constexpr auto alignment = arch_t::alignment(); - static constexpr auto simd_size = simd_type::size; - static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1); static constexpr auto horner_coeffs = []() constexpr noexcept { if constexpr (upsampfact == 200) { return get_horner_coeffs_200(); - } else if constexpr (upsampfact == 125) { + } else { // upsampfact == 125 + static_assert(upsampfact == 125, "Unsupported upsampfact"); return get_horner_coeffs_125(); } }(); - static constexpr auto nc = horner_coeffs.size(); - static constexpr auto use_ker_sym = (simd_size < w); + static constexpr auto nc = horner_coeffs.size(); alignas(alignment) static constexpr auto padded_coeffs = pad_2D_array_with_zeros(horner_coeffs); - // use kernel symmetry trick if w > simd_size + // Compile-time flag for symmetry path + static constexpr bool use_ker_sym = (simd_size < w); + + const simd_type zv{z}; + if constexpr (use_ker_sym) { - static constexpr uint8_t tail = w % simd_size; - static constexpr uint8_t if_odd_degree = ((nc + 1) % 2); - static constexpr uint8_t offset_start = tail ? 
w - tail : w - simd_size; - static constexpr uint8_t end_idx = (w + (tail > 0)) / 2; - const simd_type zv{z}; + // ---------------- Symmetry path (compile-time) ---------------- + static constexpr std::uint8_t tail = w % simd_size; + static constexpr std::uint8_t if_odd_degree = ((nc + 1) % 2); + static constexpr std::uint8_t end_idx = (w + (tail > 0)) / 2; // number of left + // elements + static constexpr std::uint8_t offset_start = tail ? (w - tail) : (w - simd_size); + const auto z2v = zv * zv; - // some xsimd constant for shuffle or inverse + // shuffle constant (compile-time) static constexpr auto shuffle_batch = []() constexpr noexcept { if constexpr (tail) { return xsimd::make_batch_constant, arch_t, @@ -327,52 +446,73 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ } }(); - // process simd vecs - simd_type k_prev, k_sym{0}; - for (uint8_t i{0}, offset = offset_start; i < end_idx; - i += simd_size, offset -= simd_size) { - auto k_odd = [i]() constexpr noexcept { + simd_type k_prev{}; // used only if tail!=0 + simd_type k_sym{0}; + + // i traverses left half in SIMD chunks; offset mirrors to right side + static_loop<0, end_idx, simd_size>([&]([[maybe_unused]] auto I) { + constexpr std::int64_t i64 = decltype(I)::value; + constexpr std::uint8_t i = static_cast(i64); + constexpr std::uint8_t offset = static_cast(offset_start) - i; + + // Build even/odd Horner lanes (compile-time unrolled) + simd_type k_odd = [i]() { if constexpr (if_odd_degree) { - return simd_type::load_aligned(padded_coeffs[0].data() + i); + return simd_type::load_aligned( + padded_coeffs[0].data() + static_cast(i)); } else { return simd_type{0}; } }(); - auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i); - for (uint8_t j{1 + if_odd_degree}; j < nc; j += 2) { - const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i); - const auto cji_even = simd_type::load_aligned(padded_coeffs[j + 1].data() + i); - k_odd = xsimd::fma(k_odd, z2v, cji_odd); - k_even = xsimd::fma(k_even, z2v, cji_even); - } - // left part - xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i); - // right part symmetric to the left part - if (offset >= end_idx) { + simd_type k_even = simd_type::load_aligned( + padded_coeffs[if_odd_degree].data() + static_cast(i)); + + // j runs over coefficient pairs (odd/even), step 2 + static_loop<1 + if_odd_degree, static_cast(nc), 2>([&](auto J) { + constexpr auto j = static_cast(decltype(J)::value); + const auto cji_odd = simd_type::load_aligned( + padded_coeffs[j].data() + static_cast(i)); + const auto cji_even = simd_type::load_aligned( + padded_coeffs[j + 1].data() + static_cast(i)); + k_odd = xsimd::fma(k_odd, z2v, cji_odd); + k_even = xsimd::fma(k_even, z2v, cji_even); + }); + + // left side + xsimd::fma(k_odd, zv, k_even).store_aligned(ker + static_cast(i)); + + // mirrored right side + if constexpr (offset >= end_idx) { if constexpr (tail) { - // to use aligned store, we need shuffle the previous k_sym and current k_sym k_prev = k_sym; k_sym = xsimd::fnma(k_odd, zv, k_even); - xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset); + xsimd::shuffle(k_sym, k_prev, shuffle_batch) + .store_aligned(ker + static_cast(offset)); } else { xsimd::swizzle(xsimd::fnma(k_odd, zv, k_even), shuffle_batch) - .store_aligned(ker + offset); + .store_aligned(ker + static_cast(offset)); } } - } + }); + } else { - const simd_type zv(z); - for (uint8_t i = 0; i < w; i += simd_size) { + // ---------------- Straight SIMD blocks 
(compile-time) ---------------- + static_loop<0, w, simd_size>([&](auto I) { + constexpr std::int64_t i64 = decltype(I)::value; + constexpr std::size_t i = static_cast(i64); + auto k = simd_type::load_aligned(padded_coeffs[0].data() + i); - for (uint8_t j = 1; j < nc; ++j) { - const auto cji = simd_type::load_aligned(padded_coeffs[j].data() + i); - k = xsimd::fma(k, zv, cji); - } + + static_loop<1, static_cast(nc), 1>([&](auto J) { + constexpr std::size_t j = static_cast(decltype(J)::value); + const auto cji = simd_type::load_aligned(padded_coeffs[j].data() + i); + k = xsimd::fma(k, zv, cji); + }); + k.store_aligned(ker + i); - } + }); } } - template static void interp_line_wrap(T *FINUFFT_RESTRICT target, const T *du, const T *ker, const BIGINT i1, const UBIGINT N1) { @@ -477,11 +617,9 @@ static void interp_line(T *FINUFFT_RESTRICT target, const T *du, const T *ker, B // optimize the code better return res_low + res_hi; }(); - const auto res_array = xsimd_to_array(res); - for (uint8_t i{0}; i < simd_size; i += 2) { - out[0] += res_array[i]; - out[1] += res_array[i + 1]; - } + const auto sum = chsum(res); // this is the SIMD channel sum + out[0] += sum[0]; + out[1] += sum[1]; // this is where the code differs from spread_kernel, the interpolator does an extra // reduction step to SIMD elements down to 2 elements // This is known as horizontal sum in SIMD terminology @@ -633,11 +771,9 @@ static void interp_square(T *FINUFFT_RESTRICT target, const T *du, const T *ker1 } return res_low + res_hi; }(); - const auto res_array = xsimd_to_array(res); - for (uint8_t i{0}; i < simd_size; i += 2) { - out[0] += res_array[i]; - out[1] += res_array[i + 1]; - } + const auto sum = chsum(res); // this is the SIMD channel sum + out[0] += sum[0]; + out[1] += sum[1]; } else { // wraps somewhere: use ptr list // this is slower than above, but occurs much less often, with fractional // rate O(ns/min(N1,N2)). Thus this code doesn't need to be so optimized. @@ -799,11 +935,9 @@ static void interp_cube(T *FINUFFT_RESTRICT target, const T *du, const T *ker1, } return res_low + res_hi; }(); - const auto res_array = xsimd_to_array(res); - for (uint8_t i{0}; i < simd_size; i += 2) { - out[0] += res_array[i]; - out[1] += res_array[i + 1]; - } + const auto sum = chsum(res); // this is the SIMD channel sum + out[0] += sum[0]; + out[1] += sum[1]; } else { return interp_cube_wrapped(target, du, ker1, ker2, ker3, i1, i2, i3, N1, N2, N3); @@ -812,7 +946,391 @@ static void interp_cube(T *FINUFFT_RESTRICT target, const T *du, const T *ker1, target[1] = out[1]; } -template +static FINUFFT_ALWAYS_INLINE void eval_kernel_Horner( + T *FINUFFT_RESTRICT ker, T x, const finufft_spread_opts & /*opts*/) noexcept { + // Map local grid offset into [-1, 1] + const T z = std::fma(T(2), x, T(w - 1)); + + // Select coefficient set at compile time + constexpr auto horner_coeffs = []() constexpr noexcept { + if constexpr (upsampfact == 200) { + return get_horner_coeffs_200(); + } else { + static_assert(upsampfact == 125, "Unsupported upsampfact"); + return get_horner_coeffs_125(); + } + }(); + constexpr std::size_t nc = horner_coeffs.size(); + + // Plain scalar Horner per tap + for (std::size_t i = 0; i < w; ++i) { + T k = horner_coeffs[0][i]; + for (std::size_t j = 1; j < nc; ++j) k = std::fma(k, z, horner_coeffs[j][i]); + ker[i] = k; + } +} + +// ----------------------------------------------------------------------------- +// Utility: evaluate 1D/2D/3D kernels into contiguous blocks, scalar path only. 
+// Writes each kernel (ns entries) into ker + i*MAX_NSPREAD. +// ----------------------------------------------------------------------------- +template +static FINUFFT_ALWAYS_INLINE T *ker_eval_scalar( + T *FINUFFT_RESTRICT ker, const finufft_spread_opts &opts, const V... elems) noexcept { + static_assert(ns > 0 && ns <= MAX_NSPREAD, "ns out of range"); + const std::array inputs{T(elems)...}; + + for (std::size_t i = 0; i < inputs.size(); ++i) { + if constexpr (kerevalmeth == 1) { + // Horner polynomial method (scalar) + if (opts.upsampfac == 2.0) { + eval_kernel_Horner(ns), 200>(ker + i * MAX_NSPREAD, + inputs[i], opts); + } + if (opts.upsampfac == 1.25) { + eval_kernel_Horner(ns), 125>(ker + i * MAX_NSPREAD, + inputs[i], opts); + } + } + if constexpr (kerevalmeth == 0) { + // "set_kernel_args + evaluate_vector" path + std::array kernel_args{}; + set_kernel_args(kernel_args.data(), inputs[i]); + evaluate_kernel_vector(ker + i * MAX_NSPREAD, kernel_args.data(), opts); + } + } + return ker; +} + +// ----------------------------------------------------------------------------- +// 1D spread: NU -> uniform subgrid (no wrapping inside subproblem) +// ----------------------------------------------------------------------------- +template +FINUFFT_NEVER_INLINE void spread_subproblem_1d_scalar_kernel( + BIGINT off1, BIGINT size1, T *FINUFFT_RESTRICT du, BIGINT M, + const T *FINUFFT_RESTRICT kx, const T *FINUFFT_RESTRICT dd, + const finufft_spread_opts &opts) { + static_assert(ns > 0 && ns <= MAX_NSPREAD, "ns out of range"); + + const T ns2 = T(ns) / T(2); + + // zero output subgrid + for (BIGINT i = 0; i < 2 * size1; ++i) du[i] = T(0); + + // local storage for kernel values (1D): uses a MAX_NSPREAD stride + T kernel_values[MAX_NSPREAD]; + + for (BIGINT i = 0; i < M; ++i) { + const T re0 = dd[2 * i]; + const T im0 = dd[2 * i + 1]; + + // start index and relative offset + BIGINT i1 = (BIGINT)std::ceil(kx[i] - ns2); + T x1 = T(i1) - kx[i]; + + // clipping safeguard (matches original behavior) + if (x1 < -ns2) x1 = -ns2; + if (x1 > -ns2 + 1) x1 = -ns2 + 1; + + // evaluate kernel for x1 into kernel_values[0..ns-1] + ker_eval_scalar(kernel_values, opts, x1); + + // accumulate into subgrid + BIGINT j = i1 - off1; + for (int dx = 0; dx < int(ns); ++dx) { + const T k = kernel_values[dx]; + du[2 * j] += re0 * k; + du[2 * j + 1] += im0 * k; + ++j; + } + } +} + +// ----------------------------------------------------------------------------- +// 2D spread: NU -> uniform subgrid (no wrapping inside subproblem) +// ----------------------------------------------------------------------------- +template +FINUFFT_NEVER_INLINE void spread_subproblem_2d_scalar_kernel( + BIGINT off1, BIGINT off2, BIGINT size1, BIGINT size2, T *FINUFFT_RESTRICT du, + BIGINT M, const T *FINUFFT_RESTRICT kx, const T *FINUFFT_RESTRICT ky, + const T *FINUFFT_RESTRICT dd, const finufft_spread_opts &opts) { + static_assert(ns > 0 && ns <= MAX_NSPREAD, "ns out of range"); + + const T ns2 = T(ns) / T(2); + + // zero output subgrid + for (BIGINT i = 0; i < 2 * size1 * size2; ++i) du[i] = T(0); + + // kernel values stored in consecutive MAX_NSPREAD-chunked blocks: [ker1 | ker2] + T kernel_values[2 * MAX_NSPREAD]; + T *ker1 = kernel_values; + T *ker2 = kernel_values + MAX_NSPREAD; + + // pre-mul buffer for complex (re,im) + T ker1val[2 * ns]; + + for (BIGINT i = 0; i < M; ++i) { + const T re0 = dd[2 * i]; + const T im0 = dd[2 * i + 1]; + + BIGINT i1 = (BIGINT)std::ceil(kx[i] - ns2); + BIGINT i2 = (BIGINT)std::ceil(ky[i] - ns2); + const T x1 = 
T(i1) - kx[i]; + const T x2 = T(i2) - ky[i]; + + // evaluate both dims at once + ker_eval_scalar(kernel_values, opts, x1, x2); + + // combine ker1 with source to reduce FLOPs in inner loop + for (int t = 0; t < int(ns); ++t) { + ker1val[2 * t] = re0 * ker1[t]; + ker1val[2 * t + 1] = im0 * ker1[t]; + } + + // critical inner loop + for (int dy = 0; dy < int(ns); ++dy) { + const BIGINT j = size1 * (i2 - off2 + dy) + (i1 - off1); + const T kerval = ker2[dy]; + T *trg = du + 2 * j; + for (int dx = 0; dx < int(2 * ns); ++dx) { + trg[dx] += kerval * ker1val[dx]; + } + } + } +} + +// ----------------------------------------------------------------------------- +// 3D spread: NU -> uniform subgrid (no wrapping inside subproblem) +// ----------------------------------------------------------------------------- +template +FINUFFT_NEVER_INLINE void spread_subproblem_3d_scalar_kernel( + BIGINT off1, BIGINT off2, BIGINT off3, BIGINT size1, BIGINT size2, BIGINT size3, + T *FINUFFT_RESTRICT du, BIGINT M, const T *FINUFFT_RESTRICT kx, + const T *FINUFFT_RESTRICT ky, const T *FINUFFT_RESTRICT kz, + const T *FINUFFT_RESTRICT dd, const finufft_spread_opts &opts) { + static_assert(ns > 0 && ns <= MAX_NSPREAD, "ns out of range"); + + const T ns2 = T(ns) / T(2); + + // zero output subgrid + for (BIGINT i = 0; i < 2 * size1 * size2 * size3; ++i) du[i] = T(0); + + // kernel values stored in consecutive MAX_NSPREAD-chunked blocks: [ker1 | ker2 | ker3] + T kernel_values[3 * MAX_NSPREAD]; + T *ker1 = kernel_values + 0 * MAX_NSPREAD; + T *ker2 = kernel_values + 1 * MAX_NSPREAD; + T *ker3 = kernel_values + 2 * MAX_NSPREAD; + + // pre-mul buffer for complex (re,im) + T ker1val[2 * ns]; + + for (BIGINT i = 0; i < M; ++i) { + const T re0 = dd[2 * i]; + const T im0 = dd[2 * i + 1]; + + BIGINT i1 = (BIGINT)std::ceil(kx[i] - ns2); + BIGINT i2 = (BIGINT)std::ceil(ky[i] - ns2); + BIGINT i3 = (BIGINT)std::ceil(kz[i] - ns2); + + const T x1 = T(i1) - kx[i]; + const T x2 = T(i2) - ky[i]; + const T x3 = T(i3) - kz[i]; + + // evaluate all three dims at once + ker_eval_scalar(kernel_values, opts, x1, x2, x3); + + // combine ker1 with source to reduce FLOPs in inner loops + for (int t = 0; t < int(ns); ++t) { + ker1val[2 * t] = re0 * ker1[t]; + ker1val[2 * t + 1] = im0 * ker1[t]; + } + + // critical inner loops + for (int dz = 0; dz < int(ns); ++dz) { + const BIGINT oz = size1 * size2 * (i3 - off3 + dz); + for (int dy = 0; dy < int(ns); ++dy) { + const BIGINT j = oz + size1 * (i2 - off2 + dy) + (i1 - off1); + const T kerval = ker2[dy] * ker3[dz]; + T *trg = du + 2 * j; + for (int dx = 0; dx < int(2 * ns); ++dx) { + trg[dx] += kerval * ker1val[dx]; + } + } + } + } +} + +// ----------------------------------------------------------------------------- +// 1D interp: uniform subgrid -> NU (with wrapping) +// ----------------------------------------------------------------------------- +template +FINUFFT_NEVER_INLINE void interp_line_scalar_kernel(T *target, const T *du, const T *ker, + BIGINT i1, BIGINT N1) { + static_assert(ns > 0, "ns must be positive"); + static_assert(ns <= MAX_NSPREAD, "ns exceeds MAX_NSPREAD"); + + T out[2] = {T(0), T(0)}; + BIGINT j = i1; + + if (i1 < 0) { // wraps at left + j += N1; + for (int dx = 0; dx < -i1; ++dx) { + out[0] += du[2 * j] * ker[dx]; + out[1] += du[2 * j + 1] * ker[dx]; + ++j; + } + j -= N1; + for (int dx = -i1; dx < int(ns); ++dx) { + out[0] += du[2 * j] * ker[dx]; + out[1] += du[2 * j + 1] * ker[dx]; + ++j; + } + } else if (i1 + ns >= N1) { // wraps at right + for (int dx = 0; dx < int(N1 - 
i1); ++dx) { + out[0] += du[2 * j] * ker[dx]; + out[1] += du[2 * j + 1] * ker[dx]; + ++j; + } + j -= N1; + for (int dx = int(N1 - i1); dx < int(ns); ++dx) { + out[0] += du[2 * j] * ker[dx]; + out[1] += du[2 * j + 1] * ker[dx]; + ++j; + } + } else { // no wrapping + for (int dx = 0; dx < int(ns); ++dx) { + out[0] += du[2 * j] * ker[dx]; + out[1] += du[2 * j + 1] * ker[dx]; + ++j; + } + } + + target[0] = out[0]; + target[1] = out[1]; +} + +// ----------------------------------------------------------------------------- +// 2D interp: uniform subgrid -> NU (with wrapping) +// ----------------------------------------------------------------------------- +template +FINUFFT_NEVER_INLINE void interp_square_scalar_kernel( + T *target, const T *du, const T *ker1, const T *ker2, BIGINT i1, BIGINT i2, BIGINT N1, + BIGINT N2) { + static_assert(ns > 0, "ns must be positive"); + static_assert(ns <= MAX_NSPREAD, "ns exceeds MAX_NSPREAD"); + + T out[2] = {T(0), T(0)}; + + if (i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2) { + // no wrapping: SIMD/FMA-friendly path + T line[2 * ns]; // interleaved real/imag + { + const T *lptr = du + 2 * (N1 * i2 + i1); + for (int l = 0; l < int(2 * ns); ++l) line[l] = ker2[0] * lptr[l]; + } + for (int dy = 1; dy < int(ns); ++dy) { + const T *lptr = du + 2 * (N1 * (i2 + dy) + i1); + for (int l = 0; l < int(2 * ns); ++l) line[l] += ker2[dy] * lptr[l]; + } + for (int dx = 0; dx < int(ns); ++dx) { + out[0] += line[2 * dx] * ker1[dx]; + out[1] += line[2 * dx + 1] * ker1[dx]; + } + } else { + // wrapping path (rare) + BIGINT j1[ns], j2[ns]; + BIGINT x = i1, y = i2; + for (int d = 0; d < int(ns); ++d) { + if (x < 0) x += N1; + if (x >= N1) x -= N1; + j1[d] = x++; + if (y < 0) y += N2; + if (y >= N2) y -= N2; + j2[d] = y++; + } + for (int dy = 0; dy < int(ns); ++dy) { + const BIGINT oy = N1 * j2[dy]; + for (int dx = 0; dx < int(ns); ++dx) { + const T k = ker1[dx] * ker2[dy]; + const BIGINT j = oy + j1[dx]; + out[0] += du[2 * j] * k; + out[1] += du[2 * j + 1] * k; + } + } + } + + target[0] = out[0]; + target[1] = out[1]; +} + +// ----------------------------------------------------------------------------- +// 3D interp: uniform subgrid -> NU (with wrapping) +// ----------------------------------------------------------------------------- +template +FINUFFT_NEVER_INLINE void interp_cube_scalar_kernel( + T *target, const T *du, const T *ker1, const T *ker2, const T *ker3, BIGINT i1, + BIGINT i2, BIGINT i3, BIGINT N1, BIGINT N2, BIGINT N3) { + static_assert(ns > 0, "ns must be positive"); + static_assert(ns <= MAX_NSPREAD, "ns exceeds MAX_NSPREAD"); + + T out[2] = {T(0), T(0)}; + + if (i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2 && i3 >= 0 && i3 + ns <= N3) { + // no wrapping: SIMD/FMA-friendly path + T line[2 * ns]; + for (int l = 0; l < int(2 * ns); ++l) line[l] = T(0); + + for (int dz = 0; dz < int(ns); ++dz) { + const BIGINT oz = N1 * N2 * (i3 + dz); + for (int dy = 0; dy < int(ns); ++dy) { + const T *lptr = du + 2 * (oz + N1 * (i2 + dy) + i1); + const T ker23 = ker2[dy] * ker3[dz]; + for (int l = 0; l < int(2 * ns); ++l) line[l] += lptr[l] * ker23; + } + } + for (int dx = 0; dx < int(ns); ++dx) { + out[0] += line[2 * dx] * ker1[dx]; + out[1] += line[2 * dx + 1] * ker1[dx]; + } + } else { + // wrapping path (rare) + BIGINT j1[ns], j2[ns], j3[ns]; + BIGINT x = i1, y = i2, z = i3; + for (int d = 0; d < int(ns); ++d) { + if (x < 0) x += N1; + if (x >= N1) x -= N1; + j1[d] = x++; + if (y < 0) y += N2; + if (y >= N2) y -= N2; + j2[d] = y++; + if (z < 0) z += N3; + if 
(z >= N3) z -= N3; + j3[d] = z++; + } + for (int dz = 0; dz < int(ns); ++dz) { + const BIGINT oz = N1 * N2 * j3[dz]; + for (int dy = 0; dy < int(ns); ++dy) { + const BIGINT oy = oz + N1 * j2[dy]; + const T ker23 = ker2[dy] * ker3[dz]; + for (int dx = 0; dx < int(ns); ++dx) { + const T k = ker1[dx] * ker23; + const BIGINT j = oy + j1[dx]; + out[0] += du[2 * j] * k; + out[1] += du[2 * j + 1] * k; + } + } + } + } + + target[0] = out[0]; + target[1] = out[1]; +} + +template()>, typename... V> static FINUFFT_ALWAYS_INLINE auto ker_eval( @@ -856,7 +1374,7 @@ static FINUFFT_ALWAYS_INLINE auto ker_eval( return ker; } -template +template FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel( const BIGINT off1, const UBIGINT size1, T *FINUFFT_RESTRICT du, const UBIGINT M, const T *const kx, const T *const dd, const finufft_spread_opts &opts) noexcept { @@ -996,77 +1514,39 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel( } } -template -static void spread_subproblem_1d_dispatch( - const BIGINT off1, const UBIGINT size1, T *FINUFFT_RESTRICT du, const UBIGINT M, - const T *kx, const T *dd, const finufft_spread_opts &opts) noexcept { - /* this is a dispatch function that will call the correct kernel based on the ns - it recursively iterates from MAX_NSPREAD to MIN_NSPREAD - it generates the following code: - if (ns == MAX_NSPREAD) { - if (opts.kerevalmeth) { - return spread_subproblem_1d_kernel(off1, size1, du, M, kx, dd, - opts); - } else { - return spread_subproblem_1d_kernel(off1, size1, du, M, kx, dd, - opts); - } - if (ns == MAX_NSPREAD-1) { - if (opts.kerevalmeth) { - return spread_subproblem_1d_kernel(off1, size1, du, M, kx, dd, - opts); - } else { - return spread_subproblem_1d_kernel(off1, size1, du, M, kx, - dd, opts); - } - } - ... - NOTE: using a big MAX_NSPREAD will generate a lot of code - if MAX_NSPREAD gets too large it will crash the compiler with a compile time - stack overflow. Older compiler will just throw an internal error without - providing any useful information on the error. - This is a known issue with template metaprogramming. - If you increased MAX_NSPREAD and the code does not compile, try reducing it. - */ - static_assert(MIN_NSPREAD <= NS && NS <= MAX_NSPREAD, - "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)"); - if constexpr (NS == MIN_NSPREAD) { // Base case - if (opts.kerevalmeth) - return spread_subproblem_1d_kernel(off1, size1, du, M, kx, dd, - opts); - else { - return spread_subproblem_1d_kernel(off1, size1, du, M, kx, - dd, opts); - } - } else { - if (opts.nspread == NS) { - if (opts.kerevalmeth) { - return spread_subproblem_1d_kernel(off1, size1, du, M, kx, dd, opts); - } else { - return spread_subproblem_1d_kernel(off1, size1, du, M, kx, dd, - opts); - } - } else { - return spread_subproblem_1d_dispatch(off1, size1, du, M, kx, dd, opts); - } +namespace { + +template struct SpreadSubproblem1dCaller { + BIGINT off1; + UBIGINT size1; + T *du; + UBIGINT M; + const T *kx; + const T *dd; + const finufft_spread_opts &opts; + template int operator()() const { + if (opts.simd == 1) + spread_subproblem_1d_scalar_kernel(off1, size1, du, M, kx, dd, opts); + else + spread_subproblem_1d_kernel(off1, size1, du, M, kx, dd, opts); + return 0; } -} +}; + +} // namespace template static void spread_subproblem_1d(BIGINT off1, UBIGINT size1, T *du, UBIGINT M, T *kx, - T *dd, const finufft_spread_opts &opts) noexcept -/* spreader from dd (NU) to du (uniform) in 2D without wrapping. - See above docs/notes for spread_subproblem_2d. 
- kx,ky (size M) are NU locations in [off+ns/2,off+size-1-ns/2] in both dims. - dd (size M complex) are complex source strengths - du (size size1*size2) is complex uniform output array - For algoritmic details see spread_subproblem_1d_kernel. -*/ -{ - spread_subproblem_1d_dispatch(off1, size1, du, M, kx, dd, opts); + T *dd, const finufft_spread_opts &opts) noexcept { + SpreadSubproblem1dCaller caller{off1, size1, du, M, kx, dd, opts}; + using NsSeq = make_range; + using KerSeq = std::make_integer_sequence; + auto params = std::make_tuple(DispatchParam{opts.nspread}, + DispatchParam{opts.kerevalmeth}); + dispatch(caller, params); } -template +template FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel( const BIGINT off1, const BIGINT off2, const UBIGINT size1, const UBIGINT size2, T *FINUFFT_RESTRICT du, const UBIGINT M, const T *kx, const T *ky, const T *dd, @@ -1159,36 +1639,29 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel( } } -template -void spread_subproblem_2d_dispatch( - const BIGINT off1, const BIGINT off2, const UBIGINT size1, const UBIGINT size2, - T *FINUFFT_RESTRICT du, const UBIGINT M, const T *kx, const T *ky, const T *dd, - const finufft_spread_opts &opts) { - static_assert(MIN_NSPREAD <= NS && NS <= MAX_NSPREAD, - "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)"); - if constexpr (NS == MIN_NSPREAD) { // Base case - if (opts.kerevalmeth) - return spread_subproblem_2d_kernel(off1, off2, size1, size2, - du, M, kx, ky, dd, opts); - else { - return spread_subproblem_2d_kernel(off1, off2, size1, size2, - du, M, kx, ky, dd, opts); - } - } else { - if (opts.nspread == NS) { - if (opts.kerevalmeth) { - return spread_subproblem_2d_kernel(off1, off2, size1, size2, du, M, - kx, ky, dd, opts); - } else { - return spread_subproblem_2d_kernel(off1, off2, size1, size2, du, M, - kx, ky, dd, opts); - } - } else { - return spread_subproblem_2d_dispatch(off1, off2, size1, size2, du, M, kx, - ky, dd, opts); - } +namespace { + +template struct SpreadSubproblem2dCaller { + BIGINT off1, off2; + UBIGINT size1, size2; + T *du; + UBIGINT M; + const T *kx; + const T *ky; + const T *dd; + const finufft_spread_opts &opts; + template int operator()() const { + if (opts.simd == 1) + spread_subproblem_2d_scalar_kernel(off1, off2, size1, size2, du, M, kx, + ky, dd, opts); + else + spread_subproblem_2d_kernel(off1, off2, size1, size2, du, M, kx, ky, dd, + opts); + return 0; } -} +}; + +} // namespace template static void spread_subproblem_2d(BIGINT off1, BIGINT off2, UBIGINT size1, UBIGINT size2, @@ -1203,11 +1676,15 @@ static void spread_subproblem_2d(BIGINT off1, BIGINT off2, UBIGINT size1, UBIGIN For algoritmic details see spread_subproblem_1d_kernel. 
*/ { - spread_subproblem_2d_dispatch(off1, off2, size1, size2, du, M, kx, ky, - dd, opts); + SpreadSubproblem2dCaller caller{off1, off2, size1, size2, du, M, kx, ky, dd, opts}; + using NsSeq = make_range; + using KerSeq = std::make_integer_sequence; + auto params = std::make_tuple(DispatchParam{opts.nspread}, + DispatchParam{opts.kerevalmeth}); + dispatch(caller, params); } -template +template FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel( const BIGINT off1, const BIGINT off2, const BIGINT off3, const UBIGINT size1, const UBIGINT size2, const UBIGINT size3, T *FINUFFT_RESTRICT du, const UBIGINT M, @@ -1283,36 +1760,30 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel( } } -template -void spread_subproblem_3d_dispatch(BIGINT off1, BIGINT off2, BIGINT off3, UBIGINT size1, - UBIGINT size2, UBIGINT size3, T *du, UBIGINT M, - const T *kx, const T *ky, const T *kz, const T *dd, - const finufft_spread_opts &opts) noexcept { - static_assert(MIN_NSPREAD <= NS && NS <= MAX_NSPREAD, - "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)"); - if constexpr (NS == MIN_NSPREAD) { // Base case - if (opts.kerevalmeth) - return spread_subproblem_3d_kernel( - off1, off2, off3, size1, size2, size3, du, M, kx, ky, kz, dd, opts); - else { - return spread_subproblem_3d_kernel( - off1, off2, off3, size1, size2, size3, du, M, kx, ky, kz, dd, opts); - } - } else { - if (opts.nspread == NS) { - if (opts.kerevalmeth) { - return spread_subproblem_3d_kernel( - off1, off2, off3, size1, size2, size3, du, M, kx, ky, kz, dd, opts); - } else { - return spread_subproblem_3d_kernel( - off1, off2, off3, size1, size2, size3, du, M, kx, ky, kz, dd, opts); - } - } else { - return spread_subproblem_3d_dispatch(off1, off2, off3, size1, size2, - size3, du, M, kx, ky, kz, dd, opts); - } +namespace { + +template struct SpreadSubproblem3dCaller { + BIGINT off1, off2, off3; + UBIGINT size1, size2, size3; + T *du; + UBIGINT M; + T *kx; + T *ky; + T *kz; + T *dd; + const finufft_spread_opts &opts; + template int operator()() const { + if (opts.simd == 1) + spread_subproblem_3d_scalar_kernel(off1, off2, off3, size1, size2, size3, + du, M, kx, ky, kz, dd, opts); + else + spread_subproblem_3d_kernel(off1, off2, off3, size1, size2, size3, du, M, + kx, ky, kz, dd, opts); + return 0; } -} +}; + +} // namespace template static void spread_subproblem_3d(BIGINT off1, BIGINT off2, BIGINT off3, UBIGINT size1, @@ -1326,8 +1797,13 @@ dd (size M complex) are complex source strengths du (size size1*size2*size3) is uniform complex output array */ { - spread_subproblem_3d_dispatch(off1, off2, off3, size1, size2, size3, du, - M, kx, ky, kz, dd, opts); + SpreadSubproblem3dCaller caller{off1, off2, off3, size1, size2, size3, du, + M, kx, ky, kz, dd, opts}; + using NsSeq = make_range; + using KerSeq = std::make_integer_sequence; + auto params = std::make_tuple(DispatchParam{opts.nspread}, + DispatchParam{opts.kerevalmeth}); + dispatch(caller, params); } template @@ -1467,11 +1943,10 @@ static void bin_sort_singlethread(std::vector &ret, UBIGINT M, const T * } template -static void bin_sort_multithread(std::vector &ret, UBIGINT M, T *kx, T *ky, - T *kz, UBIGINT N1, UBIGINT N2, UBIGINT N3, - double bin_size_x, double bin_size_y, - double bin_size_z, int debug [[maybe_unused]], - int nthr) +static void bin_sort_multithread(std::vector &ret, UBIGINT M, T *kx, T *ky, T *kz, + UBIGINT N1, UBIGINT N2, UBIGINT N3, double bin_size_x, + double bin_size_y, double bin_size_z, + int debug [[maybe_unused]], int nthr) /* Mostly-OpenMP'ed version of bin_sort. 
For documentation see: bin_sort_singlethread. Caution: when M (# NU pts) << N (# U pts), is SLOWER than single-thread. @@ -1943,7 +2418,7 @@ static int spreadSorted(const std::vector &sort_indices, UBIGINT N1, UBI }; // -------------------------------------------------------------------------- -template +template FINUFFT_NEVER_INLINE static int interpSorted_kernel( const std::vector &sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, const T *data_uniform, const UBIGINT M, T *FINUFFT_RESTRICT kx, @@ -2015,25 +2490,47 @@ FINUFFT_NEVER_INLINE static int interpSorted_kernel( // eval kernel values patch and use to interpolate from uniform data... if (!(opts.flags & TF_OMIT_SPREADING)) { - switch (ndims) { - case 1: - ker_eval(kernel_values.data(), opts, x1); - interp_line(target, data_uniform, ker1, i1, N1); - break; - case 2: - ker_eval(kernel_values.data(), opts, x1, x2); - interp_square(target, data_uniform, ker1, ker2, i1, i2, N1, - N2); - break; - case 3: - ker_eval(kernel_values.data(), opts, x1, x2, - x3); - interp_cube(target, data_uniform, ker1, ker2, ker3, i1, i2, - i3, N1, N2, N3); - break; - default: // can't get here - FINUFFT_UNREACHABLE; - break; + if (opts.simd == 1) { + switch (ndims) { + case 1: + ker_eval_scalar(kernel_values.data(), opts, x1); + interp_line_scalar_kernel(target, data_uniform, ker1, i1, N1); + break; + case 2: + ker_eval_scalar(kernel_values.data(), opts, x1, x2); + interp_square_scalar_kernel(target, data_uniform, ker1, ker2, i1, i2, + N1, N2); + break; + case 3: + ker_eval_scalar(kernel_values.data(), opts, x1, x2, x3); + interp_cube_scalar_kernel(target, data_uniform, ker1, ker2, ker3, i1, + i2, i3, N1, N2, N3); + break; + default: // can't get here + FINUFFT_UNREACHABLE; + break; + } + } else { + switch (ndims) { + case 1: + ker_eval(kernel_values.data(), opts, x1); + interp_line(target, data_uniform, ker1, i1, N1); + break; + case 2: + ker_eval(kernel_values.data(), opts, x1, x2); + interp_square(target, data_uniform, ker1, ker2, i1, i2, + N1, N2); + break; + case 3: + ker_eval(kernel_values.data(), opts, x1, x2, + x3); + interp_cube(target, data_uniform, ker1, ker2, ker3, i1, + i2, i3, N1, N2, N3); + break; + default: // can't get here + FINUFFT_UNREACHABLE; + break; + } } } } // end loop over targets in chunk @@ -2051,37 +2548,25 @@ FINUFFT_NEVER_INLINE static int interpSorted_kernel( return 0; } -template -static int interpSorted_dispatch( - const std::vector &sort_indices, const UBIGINT N1, const UBIGINT N2, - const UBIGINT N3, T *FINUFFT_RESTRICT data_uniform, const UBIGINT M, - T *FINUFFT_RESTRICT kx, T *FINUFFT_RESTRICT ky, T *FINUFFT_RESTRICT kz, - T *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) { - static_assert(MIN_NSPREAD <= NS && NS <= MAX_NSPREAD, - "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)"); - if constexpr (NS == MIN_NSPREAD) { // Base case - if (opts.kerevalmeth) - return interpSorted_kernel( - sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, opts); - else { - return interpSorted_kernel( - sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, opts); - } - } else { - if (opts.nspread == NS) { - if (opts.kerevalmeth) { - return interpSorted_kernel(sort_indices, N1, N2, N3, data_uniform, M, - kx, ky, kz, data_nonuniform, opts); - } else { - return interpSorted_kernel(sort_indices, N1, N2, N3, data_uniform, - M, kx, ky, kz, data_nonuniform, opts); - } - } else { - return interpSorted_dispatch(sort_indices, N1, N2, N3, data_uniform, M, - kx, ky, kz, 
data_nonuniform, opts); - } +namespace { + +template struct InterpSortedCaller { + const std::vector &sort_indices; + UBIGINT N1, N2, N3; + T *data_uniform; + UBIGINT M; + T *kx; + T *ky; + T *kz; + T *data_nonuniform; + const finufft_spread_opts &opts; + template int operator()() const { + return interpSorted_kernel(sort_indices, N1, N2, N3, data_uniform, M, kx, + ky, kz, data_nonuniform, opts); } -} +}; + +} // namespace template static int interpSorted( @@ -2089,8 +2574,13 @@ static int interpSorted( const UBIGINT N3, T *FINUFFT_RESTRICT data_uniform, const UBIGINT M, T *FINUFFT_RESTRICT kx, T *FINUFFT_RESTRICT ky, T *FINUFFT_RESTRICT kz, T *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) { - return interpSorted_dispatch(sort_indices, N1, N2, N3, data_uniform, M, - kx, ky, kz, data_nonuniform, opts); + InterpSortedCaller caller{sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, + data_nonuniform, opts}; + using NsSeq = make_range; + using KerSeq = std::make_integer_sequence; + auto params = std::make_tuple(DispatchParam{opts.nspread}, + DispatchParam{opts.kerevalmeth}); + return dispatch(caller, params); } template @@ -2177,6 +2667,7 @@ FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader( opts.sort = 2; // 2:auto-choice opts.kerpad = 0; // affects only evaluate_kernel_vector opts.kerevalmeth = kerevalmeth; + opts.simd = 2; opts.upsampfac = upsampfac; opts.nthreads = 0; // all avail opts.sort_threads = 0; // 0:auto-choice @@ -2198,7 +2689,8 @@ FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader( if (upsampfac == 2.0) // standard sigma (see SISC paper) ns = std::ceil(-log10(eps / (T)10.0)); // 1 digit per power of 10 else // custom sigma - ns = std::ceil(-log(eps) / (T(PI) * sqrt(1.0 - 1.0 / upsampfac))); // formula, gam=1 + ns = std::ceil(-log(eps) / (T(PI) * sqrt(1.0 - 1.0 / upsampfac))); // formula, + // gam=1 ns = max(2, ns); // (we don't have ns=1 version yet) if (ns > MAX_NSPREAD) { // clip to fit allocated arrays, Horner rules if (showwarn) @@ -2220,8 +2712,8 @@ FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader( if (ns == 4) betaoverns = 2.38; if (upsampfac != 2.0) { // again, override beta for custom sigma T gamma = 0.97; // must match devel/gen_all_horner_C_code.m ! 
- betaoverns = gamma * T(PI) * (1.0 - 1.0 / (2 * upsampfac)); // formula based on - // cutoff + betaoverns = gamma * T(PI) * (1.0 - 1.0 / (2 * upsampfac)); // formula based + // on cutoff } opts.ES_beta = betaoverns * ns; // set the kernel beta parameter if (debug) @@ -2301,26 +2793,24 @@ T evaluate_kernel_horner_dispatch(T x, const finufft_spread_opts &opts) { if constexpr (ns == MIN_NSPREAD) { // Base case if (opts.upsampfac == 2.0) { return evaluate_kernel_horner_kernel(x, opts); - } else if (opts.upsampfac == 1.25) { + } + if (opts.upsampfac == 1.25) { return evaluate_kernel_horner_kernel(x, opts); - } else { - fprintf(stderr, "[%s] upsampfac (%lf) not supported!\n", __func__, opts.upsampfac); - return 0.0; } + fprintf(stderr, "[%s] upsampfac (%lf) not supported!\n", __func__, opts.upsampfac); + return 0.0; } else { if (opts.nspread == ns) { if (opts.upsampfac == 2.0) { return evaluate_kernel_horner_kernel(x, opts); - } else if (opts.upsampfac == 1.25) { + } + if (opts.upsampfac == 1.25) { return evaluate_kernel_horner_kernel(x, opts); - } else { - fprintf(stderr, "[%s] upsampfac (%lf) not supported!\n", __func__, - opts.upsampfac); - return 0.0; } - } else { - return evaluate_kernel_horner_dispatch(x, opts); + fprintf(stderr, "[%s] upsampfac (%lf) not supported!\n", __func__, opts.upsampfac); + return 0.0; } + return evaluate_kernel_horner_dispatch(x, opts); } } diff --git a/src/cuda/utils.cpp b/src/utils.cpp similarity index 67% rename from src/cuda/utils.cpp rename to src/utils.cpp index 361a41e0e..7774db3a1 100644 --- a/src/cuda/utils.cpp +++ b/src/utils.cpp @@ -1,32 +1,20 @@ -#include +#include +#include -namespace cufinufft { -namespace utils { +#ifdef __CUDACC__ +#include +#endif -CUFINUFFT_BIGINT next235beven(CUFINUFFT_BIGINT n, CUFINUFFT_BIGINT b) -// finds even integer not less than n, with prime factors no larger than 5 -// (ie, "smooth") and is a multiple of b (b is a number that the only prime -// factors are 2,3,5). Adapted from fortran in hellskitchen. Barnett 2/9/17 -// changed INT64 type 3/28/17. Runtime is around n*1e-11 sec for big n. -// added condition about b, Melody Shih 05/31/20 -{ - if (n <= 2) return 2; - if (n % 2 == 1) n += 1; // even - CUFINUFFT_BIGINT nplus = n - 2; // to cancel out the +=2 at start of loop - CUFINUFFT_BIGINT numdiv = 2; // a dummy that is >1 - while ((numdiv > 1) || (nplus % b != 0)) { - nplus += 2; // stays even - numdiv = nplus; - while (numdiv % 2 == 0) numdiv /= 2; // remove all factors of 2,3,5... - while (numdiv % 3 == 0) numdiv /= 3; - while (numdiv % 5 == 0) numdiv /= 5; - } - return nplus; -} +namespace finufft { +namespace common { void gaussquad(int n, double *xgl, double *wgl) { - // copied from FINUFFT/src/finufft_utils.cpp; see that for explanation. - // Note: M_PI needed not PI; defined in utils.h + // n-node Gauss-Legendre quadrature, adapted from a code by Jason Kaye (2022-2023), + // from the utils file of https://github.com/flatironinstitute/cppdlr version 1.2, + // which is Apache-2 licensed. It uses Newton iteration from Chebyshev points. + // Double-precision only. + // Adapted by Barnett 6/8/25 to write nodes (xgl) and weights (wgl) into arrays + // that the user must pre-allocate to length at least n. 
double x = 0, dx = 0; int convcount = 0; @@ -35,8 +23,8 @@ void gaussquad(int n, double *xgl, double *wgl) { xgl[n / 2] = 0; // If odd number of nodes, middle node is 0 for (int i = 0; i < n / 2; i++) { // Loop through nodes convcount = 0; - x = cos((2 * i + 1) * M_PI / (2 * n)); // Initial guess: Chebyshev node - while (true) { // Newton iteration + x = std::cos((2 * i + 1) * PI / (2 * n)); // Initial guess: Chebyshev node + while (true) { // Newton iteration auto [p, dp] = leg_eval(n, x); dx = -p / dp; x += dx; // Newton step @@ -63,7 +51,9 @@ } std::tuple<double, double> leg_eval(int n, double x) { - // copied from FINUFFT/src/finufft_utils.cpp; see that for explanation. + // return Legendre polynomial P_n(x) and its derivative P'_n(x). + // Uses Legendre three-term recurrence. + // Used by gaussquad above, with which it shares the same authorship and source. if (n == 0) { return {1.0, 0.0}; @@ -81,5 +71,32 @@ std::tuple<double, double> leg_eval(int n, double x) { return {p2, n * (x * p2 - p1) / (x * x - 1)}; } +} // namespace common +} // namespace finufft + +namespace cufinufft { +namespace utils { + +long next235beven(long n, long b) +// finds even integer not less than n, with prime factors no larger than 5 +// (ie, "smooth") and is a multiple of b (b is a number whose only prime +// factors are 2,3,5). Adapted from fortran in hellskitchen. Barnett 2/9/17 +// changed INT64 type 3/28/17. Runtime is around n*1e-11 sec for big n. +// added condition about b, Melody Shih 05/31/20 +{ + if (n <= 2) return 2; + if (n % 2 == 1) n += 1; // even + long nplus = n - 2; // to cancel out the +=2 at start of loop + long numdiv = 2; // a dummy that is >1 + while ((numdiv > 1) || (nplus % b != 0)) { + nplus += 2; // stays even + numdiv = nplus; + while (numdiv % 2 == 0) numdiv /= 2; // remove all factors of 2,3,5...
+ while (numdiv % 3 == 0) numdiv /= 3; + while (numdiv % 5 == 0) numdiv /= 5; + } + return nplus; +} + } // namespace utils } // namespace cufinufft diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 10ccf86d6..e1ba7c8ba 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -10,7 +10,9 @@ set(TESTS finufft3dmany_test finufft3dkernel_test spreadinterp1d_test + spread_simd_test adjointness + hint_nj_test ) foreach(TEST ${TESTS}) @@ -38,9 +40,11 @@ finufft_link_test(testutils) add_test(NAME run_testutils COMMAND testutils WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) if(NOT FINUFFT_USE_DUCC0 AND FINUFFT_USE_OPENMP) + find_package(OpenMP COMPONENTS CXX REQUIRED) add_executable(fftw_lock_test fftw_lock_test.cpp) target_compile_features(fftw_lock_test PRIVATE cxx_std_17) finufft_link_test(fftw_lock_test) + target_link_libraries(fftw_lock_test PRIVATE OpenMP::OpenMP_CXX) add_test(NAME run_fftw_lock_test COMMAND fftw_lock_test WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) endif() @@ -96,7 +100,11 @@ function(add_tests_with_prec PREC REQ_TOL CHECK_TOL SUFFIX) WORKING_DIRECTORY ${CMAKE_BINARY_DIR} ) + add_test(NAME spread_simd_${PREC} COMMAND spread_simd_test${SUFFIX} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + add_test(NAME run_adjointness_${PREC} COMMAND adjointness${SUFFIX} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + + add_test(NAME run_hint_nj_${PREC} COMMAND hint_nj_test${SUFFIX} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) endfunction() # use above function to actually add the tests, with certain requested and check diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index c7f21689f..1ee87f185 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -5,10 +5,9 @@ foreach(srcfile ${test_src}) get_filename_component(executable ${executable} NAME) add_executable(${executable} ${srcfile}) target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) - find_library(MathLib m) target_link_libraries(${executable} cufinufft CUDA::cufft CUDA::cudart) - if(MathLib) - target_link_libraries(${executable} ${MathLib}) + if(MATH_LIBRARY) + target_link_libraries(${executable} ${MATH_LIBRARY}) endif() target_compile_features(${executable} PUBLIC cxx_std_17) set_target_properties( diff --git a/test/cuda/cufinufft1d_test.cu b/test/cuda/cufinufft1d_test.cu index 63f286d35..e783dea2f 100644 --- a/test/cuda/cufinufft1d_test.cu +++ b/test/cuda/cufinufft1d_test.cu @@ -8,6 +8,7 @@ #include #include +#include #include #include "../utils/dirft1d.hpp" @@ -18,6 +19,8 @@ constexpr auto TEST_BIGPROB = 1e8; +using ::finufft::common::PI; + template int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag, double upsampfac) { @@ -42,7 +45,7 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag, // Making data for (int i = 0; i < M; i++) { - x[i] = M_PI * randm11(); // x in [-pi,pi) + x[i] = PI * randm11(); // x in [-pi,pi) } if (type == 1) { diff --git a/test/cuda/cufinufft1dspreadinterponly_test.cu b/test/cuda/cufinufft1dspreadinterponly_test.cu index 9b7f80b6f..4c818c5a6 100644 --- a/test/cuda/cufinufft1dspreadinterponly_test.cu +++ b/test/cuda/cufinufft1dspreadinterponly_test.cu @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -14,7 +15,9 @@ #include #include -using cufinufft::utils::infnorm; +#include "../utils/norms.hpp" + +using ::finufft::common::PI; template int run_test(int N1, int M, T tol, T checktol, int iflag, double upsampfac) { @@ -90,7 +93,7 @@ int run_test(int N1, int M, T tol, T checktol, int iflag, double 
upsampfac) { // Making data for (int i = 0; i < M; i++) { - x[i] = M_PI * randm11(); // x in [-pi,pi) + x[i] = PI * randm11(); // x in [-pi,pi) } for (int i = 0; i < M; i++) { diff --git a/test/cuda/cufinufft2d1nupts_test.cu b/test/cuda/cufinufft2d1nupts_test.cu index d1a25fb20..95af119fb 100644 --- a/test/cuda/cufinufft2d1nupts_test.cu +++ b/test/cuda/cufinufft2d1nupts_test.cu @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -16,7 +17,9 @@ #include #include -using cufinufft::utils::infnorm; +#include "../utils/norms.hpp" + +using ::finufft::common::PI; template int run_test(int method) { int N1 = 100; @@ -49,15 +52,15 @@ template int run_test(int method) { // Making data for (int i = 0; i < M1; i++) { - x1[i] = M_PI * randm11(); // x in [-pi,pi) - y1[i] = M_PI * randm11(); + x1[i] = PI * randm11(); // x in [-pi,pi) + y1[i] = PI * randm11(); c1[i].real(randm11()); c1[i].imag(randm11()); } for (int i = 0; i < M2; i++) { - x2[i] = M_PI * randm11(); // x in [-pi,pi) - y2[i] = M_PI * randm11(); + x2[i] = PI * randm11(); // x in [-pi,pi) + y2[i] = PI * randm11(); c2[i].real(randm11()); c2[i].imag(randm11()); } diff --git a/test/cuda/cufinufft2d_test.cu b/test/cuda/cufinufft2d_test.cu index c3e666691..917d2d3e6 100644 --- a/test/cuda/cufinufft2d_test.cu +++ b/test/cuda/cufinufft2d_test.cu @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -20,6 +21,8 @@ constexpr auto TEST_BIGPROB = 1e8; +using ::finufft::common::PI; + template int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int iflag, double upsampfac) { @@ -39,8 +42,8 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int // Making data for (int i = 0; i < M; i++) { - x[i] = M_PI * randm11(); // x in [-pi,pi) - y[i] = M_PI * randm11(); + x[i] = PI * randm11(); // x in [-pi,pi) + y[i] = PI * randm11(); } if (type == 1) { for (int i = 0; i < M; i++) { diff --git a/test/cuda/cufinufft2dmany_test.cu b/test/cuda/cufinufft2dmany_test.cu index fd7341be3..fb254d56e 100644 --- a/test/cuda/cufinufft2dmany_test.cu +++ b/test/cuda/cufinufft2dmany_test.cu @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -15,7 +16,9 @@ #include #include -using cufinufft::utils::infnorm; +#include "../utils/norms.hpp" + +using ::finufft::common::PI; template int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize, int M, @@ -40,8 +43,8 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize // Making data for (int i = 0; i < M; i++) { - x[i] = M_PI * randm11(); // x in [-pi,pi) - y[i] = M_PI * randm11(); + x[i] = PI * randm11(); // x in [-pi,pi) + y[i] = PI * randm11(); } if (type == 1) { for (int i = 0; i < ntransf * M; i++) { @@ -61,8 +64,8 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize s.resize(N1 * N2); t.resize(N1 * N2); for (int i = 0; i < N1 * N2; i++) { - s[i] = M_PI * randm11(); - t[i] = M_PI * randm11(); + s[i] = PI * randm11(); + t[i] = PI * randm11(); } d_s = s; d_t = t; diff --git a/test/cuda/cufinufft3d_test.cu b/test/cuda/cufinufft3d_test.cu index c823ce0e1..096f26ca8 100644 --- a/test/cuda/cufinufft3d_test.cu +++ b/test/cuda/cufinufft3d_test.cu @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -19,6 +20,8 @@ constexpr auto TEST_BIGPROB = 1e8; +using ::finufft::common::PI; + template int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T checktol, int iflag, double upsampfac) { @@ -39,9 +42,9 @@ int run_test(int method, int type, int 
N1, int N2, int N3, int M, T tol, T check // Making data for (int i = 0; i < M; i++) { - x[i] = M_PI * randm11(); // x in [-pi,pi) - y[i] = M_PI * randm11(); - z[i] = M_PI * randm11(); + x[i] = PI * randm11(); // x in [-pi,pi) + y[i] = PI * randm11(); + z[i] = PI * randm11(); } if (type == 1) { for (int i = 0; i < M; i++) { diff --git a/test/cuda/public_api_test.c b/test/cuda/public_api_test.c index 7d0dceef2..33f0ab954 100644 --- a/test/cuda/public_api_test.c +++ b/test/cuda/public_api_test.c @@ -6,6 +6,8 @@ #include +static const double PI = 3.141592653589793238462643383279502884; + int test_float(int M, int N) { // Size of the grid as an array. int64_t modes[1] = {N}; @@ -36,7 +38,7 @@ int test_float(int M, int N) { srand(0); for (int j = 0; j < M; ++j) { - x[j] = 2 * M_PI * (((float)rand()) / RAND_MAX - 1); + x[j] = 2 * PI * (((float)rand()) / RAND_MAX - 1); c[j] = (2 * ((float)rand()) / RAND_MAX - 1) + I * (2 * ((float)rand()) / RAND_MAX - 1); } @@ -123,7 +125,7 @@ int test_double(int M, int N) { srand(0); for (int j = 0; j < M; ++j) { - x[j] = 2 * M_PI * (((double)rand()) / RAND_MAX - 1); + x[j] = 2 * PI * (((double)rand()) / RAND_MAX - 1); c[j] = (2 * ((double)rand()) / RAND_MAX - 1) + I * (2 * ((double)rand()) / RAND_MAX - 1); } diff --git a/test/hint_nj_test.cpp b/test/hint_nj_test.cpp new file mode 100644 index 000000000..e8bb62700 --- /dev/null +++ b/test/hint_nj_test.cpp @@ -0,0 +1,370 @@ +// Cases covered: +// A) If opts.upsampfac != 0 (1.25 or 2.00), hint_nj is ignored (all types). +// B) If opts.upsampfac == 0 and hint_nj == 0 (types 1 & 2), upsampfac is chosen at the +// first setpts from the observed density and may change on any following setpts. +// C) If opts.upsampfac == 0 and hint_nj > 0 (types 1 & 2), upsampfac is chosen at +// makeplan from density = hint_nj / N, and may change on any following setpts based +// on the actual nj (i.e., heuristic re-evaluation). +// D) Type 3 ignores hint_nj entirely; upsampfac is chosen inside setpts from geometry- +// based density and may change on any following setpts. +// +// and may be recomputed if nj changes. +// + +#include +#include + +#include +#include +#include +#include +#include + +using finufft::heuristics::bestUpsamplingFactor; + +static int g_debug = 1; + +// -------- precision-dependent globals -------- +// Tolerance used for numerical equality checks (assert), depends on FLT. +static const double g_eps = (sizeof(FLT) == 4 ? 1e-6 : 1e-12); + +// Choose NUFFT accuracy tol per precision. +// float: pick > 1e-8 (and also > eps_float*100 ~ 1e-5) to avoid the clamp-to-2.0 path. +// double: pick 1e-8 (> 1e-9) to avoid the clamp-to-2.0 path. 
+static inline FLT get_tol() { + if constexpr (std::is_same::value) { + return (FLT)2e-5; // > 1e-8 and > ~1.2e-5 + } else { + return (FLT)1e-8; // > 1e-9 + } +} + +// -------------------------------------------- + +static void fill_rand(FLT *a, BIGINT n, FLT scale = (FLT)PI) { + for (BIGINT i = 0; i < n; ++i) a[i] = scale * randm11(); +} + +static int fail(const char *where, const char *msg) { + std::printf("[FAIL] %s: %s\n", where, msg); + return 1; +} + +static int assert_eq_double(const char *where, const char *what, double got, + double expect, double eps = g_eps) { + if (std::fabs(got - expect) > eps) { + std::printf("[FAIL] %s: %s %.12g != %.12g (|diff|=%.3g > %.3g)\n", + where, what, got, expect, std::fabs(got - expect), eps); + return 1; + } + if (g_debug) { + std::printf("[ OK ] %s: %s = %.6f\n", where, what, got); + } + return 0; +} + +// ---------- Type 1 & 2 (2D) ---------- +// We choose N = 8x8 => N() = 64 so that crossing density thresholds is easy: +// density = nj / N() -> with nj = 64 => density=1 (often 1.25) +// nj = 4096 => density=64 (often 2.0) +static int test_type12_forced(int type, double forced_usf) { + const char *WHERE = (type == 1 ? "type1_forced_2D" : "type2_forced_2D"); + const int dim = 2; + const FLT tol = get_tol(); + + finufft_opts opts; + FINUFFT_DEFAULT_OPTS(&opts); + opts.debug = g_debug - 1; + opts.nthreads = 1; // single-thread tables + opts.hint_nj = 4000; // ignored when upsampfac is forced + opts.upsampfac = forced_usf; + + BIGINT Nm[2] = {8, 8}; // 2D grid: N()=64 + FINUFFT_PLAN plan; + int ier = FINUFFT_MAKEPLAN(type, dim, Nm, +1, 1, tol, &plan, &opts); + if (ier) return fail(WHERE, "makeplan error"); + + if (int e = assert_eq_double(WHERE, "plan upsampfac", plan->opts.upsampfac, forced_usf)) + { FINUFFT_DESTROY(plan); return e; } + + // setpts #1 (small density) - remains forced + BIGINT nj1 = 64; // density=1 + std::vector x1(nj1), y1(nj1); + fill_rand(x1.data(), nj1); + fill_rand(y1.data(), nj1); + ier = FINUFFT_SETPTS(plan, nj1, x1.data(), y1.data(), nullptr, + 0, nullptr, nullptr, nullptr); + if (ier) { FINUFFT_DESTROY(plan); return fail(WHERE, "setpts #1 error"); } + if (int e = assert_eq_double(WHERE, "setpts #1 upsampfac", plan->opts.upsampfac, + forced_usf)) + { FINUFFT_DESTROY(plan); return e; } + + // setpts #2 (large density) - still forced + BIGINT nj2 = 4096; // density=64 + std::vector x2(nj2), y2(nj2); + fill_rand(x2.data(), nj2); + fill_rand(y2.data(), nj2); + ier = FINUFFT_SETPTS(plan, nj2, x2.data(), y2.data(), nullptr, + 0, nullptr, nullptr, nullptr); + if (ier) { FINUFFT_DESTROY(plan); return fail(WHERE, "setpts #2 error"); } + if (int e = assert_eq_double(WHERE, "setpts #2 upsampfac", plan->opts.upsampfac, + forced_usf)) + { FINUFFT_DESTROY(plan); return e; } + + FINUFFT_DESTROY(plan); + return 0; +} + +static int test_type12_hint0(int type) { + const char *WHERE = (type == 1 ? 
"type1_hint0_2D" : "type2_hint0_2D"); + const int dim = 2; + const FLT tol = get_tol(); + + finufft_opts opts; + FINUFFT_DEFAULT_OPTS(&opts); + opts.debug = g_debug - 1; + opts.nthreads = 1; + opts.hint_nj = 0; // no hint + opts.upsampfac = 0.0; // heuristic decides at first setpts + + BIGINT Nm[2] = {8, 8}; // N() = 64 + FINUFFT_PLAN plan; + int ier = FINUFFT_MAKEPLAN(type, dim, Nm, +1, 1, tol, &plan, &opts); + if (ier) return fail(WHERE, "makeplan error"); + + if (int e = assert_eq_double(WHERE, "plan-time upsampfac", plan->opts.upsampfac, 0.0)) + { FINUFFT_DESTROY(plan); return e; } + + // First setpts: smaller density + BIGINT nj1 = 64; // density=1 + std::vector x1(nj1), y1(nj1); + fill_rand(x1.data(), nj1); + fill_rand(y1.data(), nj1); + ier = FINUFFT_SETPTS(plan, nj1, x1.data(), y1.data(), nullptr, + 0, nullptr, nullptr, nullptr); + if (ier) { FINUFFT_DESTROY(plan); return fail(WHERE, "setpts #1 error"); } + + double density1 = double(nj1) / double(plan->N()); + double expect1 = + bestUpsamplingFactor(opts.nthreads, (FLT)density1, dim, type, tol); + if (int e = assert_eq_double(WHERE, "post-setpts #1 upsampfac", + plan->opts.upsampfac, expect1)) + { FINUFFT_DESTROY(plan); return e; } + + // Second setpts: much larger density + BIGINT nj2 = 4096; // density=64 + std::vector x2(nj2), y2(nj2); + fill_rand(x2.data(), nj2); + fill_rand(y2.data(), nj2); + ier = FINUFFT_SETPTS(plan, nj2, x2.data(), y2.data(), nullptr, + 0, nullptr, nullptr, nullptr); + if (ier) { FINUFFT_DESTROY(plan); return fail(WHERE, "setpts #2 error"); } + + double density2 = double(nj2) / double(plan->N()); + double expect2 = + bestUpsamplingFactor(opts.nthreads, (FLT)density2, dim, type, tol); + if (int e = assert_eq_double(WHERE, "post-setpts #2 upsampfac (may change)", + plan->opts.upsampfac, expect2)) + { FINUFFT_DESTROY(plan); return e; } + + FINUFFT_DESTROY(plan); + return 0; +} + +static int test_type12_hintpos(int type) { + const char *WHERE = (type == 1 ? 
"type1_hintpos_2D" : "type2_hintpos_2D"); + const int dim = 2; + const FLT tol = get_tol(); + + finufft_opts opts; + FINUFFT_DEFAULT_OPTS(&opts); + opts.debug = g_debug - 1; + opts.nthreads = 1; + opts.hint_nj = 128; // density_hint = 128/64 = 2 (often 1.25) + opts.upsampfac = 0.0; // decide using hint at makeplan + + BIGINT Nm[2] = {8, 8}; // N()=64 + FINUFFT_PLAN plan; + int ier = FINUFFT_MAKEPLAN(type, dim, Nm, +1, 1, tol, &plan, &opts); + if (ier) return fail(WHERE, "makeplan error"); + + double density_hint = double(opts.hint_nj) / double(plan->N()); + double expect_plan = + bestUpsamplingFactor(opts.nthreads, (FLT)density_hint, dim, type, tol); + if (int e = assert_eq_double(WHERE, "plan-time upsampfac from hint", + plan->opts.upsampfac, expect_plan)) + { FINUFFT_DESTROY(plan); return e; } + + // First setpts with nj == hint_nj -> shouldn't change + BIGINT nj1 = opts.hint_nj; + std::vector x1(nj1), y1(nj1); + fill_rand(x1.data(), nj1); + fill_rand(y1.data(), nj1); + ier = FINUFFT_SETPTS(plan, nj1, x1.data(), y1.data(), nullptr, + 0, nullptr, nullptr, nullptr); + if (ier) { FINUFFT_DESTROY(plan); return fail(WHERE, "setpts #1 error"); } + if (int e = assert_eq_double(WHERE, "post-setpts #1 upsampfac (nj==hint)", + plan->opts.upsampfac, expect_plan)) + { FINUFFT_DESTROY(plan); return e; } + + // Second setpts with large nj to cross thresholds (may update) + BIGINT nj2 = 4096; // density=64 + std::vector x2(nj2), y2(nj2); + fill_rand(x2.data(), nj2); + fill_rand(y2.data(), nj2); + ier = FINUFFT_SETPTS(plan, nj2, x2.data(), y2.data(), nullptr, + 0, nullptr, nullptr, nullptr); + if (ier) { FINUFFT_DESTROY(plan); return fail(WHERE, "setpts #2 error"); } + + double density2 = double(nj2) / double(plan->N()); + double expect2 = + bestUpsamplingFactor(opts.nthreads, (FLT)density2, dim, type, tol); + if (int e = assert_eq_double(WHERE, "post-setpts #2 upsampfac (nj!=hint)", + plan->opts.upsampfac, expect2)) + { FINUFFT_DESTROY(plan); return e; } + + FINUFFT_DESTROY(plan); + return 0; +} + +// ---------- Type 3 (2D) ---------- +static int test_type3_forced(double forced_usf) { + const char *WHERE = "type3_forced_2D"; + const int dim = 2; + const FLT tol = get_tol(); + + finufft_opts opts; + FINUFFT_DEFAULT_OPTS(&opts); + opts.debug = g_debug - 1; + opts.nthreads = 1; + opts.hint_nj = 4000; // ignored in type 3 when upsampfac!=0 + opts.upsampfac = forced_usf; // forced + + BIGINT Nm[2] = {0, 0}; // type-3 makeplan ignores Nm + FINUFFT_PLAN plan; + int ier = FINUFFT_MAKEPLAN(3, dim, Nm, +1, 1, tol, &plan, &opts); + if (ier) return fail(WHERE, "makeplan error"); + + if (int e = assert_eq_double(WHERE, "plan-time upsampfac", plan->opts.upsampfac, + forced_usf)) + { FINUFFT_DESTROY(plan); return e; } + + BIGINT nk = 16; // some freqs + std::vector s(nk), t(nk); + fill_rand(s.data(), nk); + fill_rand(t.data(), nk); + + // setpts #1: small nj, moderate bandwidth + BIGINT nj1 = 64; + std::vector x1(nj1), y1(nj1); + fill_rand(x1.data(), nj1, (FLT)(PI * 8)); // moderate width + fill_rand(y1.data(), nj1, (FLT)(PI * 8)); + ier = FINUFFT_SETPTS(plan, nj1, x1.data(), y1.data(), nullptr, + nk, s.data(), t.data(), nullptr); + if (ier) { FINUFFT_DESTROY(plan); return fail(WHERE, "setpts #1 error"); } + if (int e = assert_eq_double(WHERE, "post-setpts #1 upsampfac (forced)", + plan->opts.upsampfac, forced_usf)) + { FINUFFT_DESTROY(plan); return e; } + + // setpts #2: very different nj/bandwidth, still forced + BIGINT nj2 = 4096; + std::vector x2(nj2), y2(nj2); + fill_rand(x2.data(), nj2, (FLT)(PI * 0.1)); // 
very small width + fill_rand(y2.data(), nj2, (FLT)(PI * 0.1)); + ier = FINUFFT_SETPTS(plan, nj2, x2.data(), y2.data(), nullptr, + nk, s.data(), t.data(), nullptr); + if (ier) { FINUFFT_DESTROY(plan); return fail(WHERE, "setpts #2 error"); } + if (int e = assert_eq_double(WHERE, "post-setpts #2 upsampfac (forced)", + plan->opts.upsampfac, forced_usf)) + { FINUFFT_DESTROY(plan); return e; } + + FINUFFT_DESTROY(plan); + return 0; +} + +// For type 3 with heuristic (upsampfac==0): hint ignored; upsampfac selected in setpts. +// NOTE: bestUpsamplingFactor currently returns 1.25 for type-3, independent of density. +// We still vary geometry (bandwidth) between calls to prepare for future changes. +static int test_type3_hint_ignored() { + const char *WHERE = "type3_hint_ignored_2D"; + const int dim = 2; + const FLT tol = get_tol(); + + finufft_opts opts; + FINUFFT_DEFAULT_OPTS(&opts); + opts.debug = g_debug - 1; + opts.nthreads = 1; + opts.hint_nj = 4000; // ignored + opts.upsampfac = 0.0; // heuristic decides in setpts + + BIGINT Nm[2] = {0, 0}; + FINUFFT_PLAN plan; + int ier = FINUFFT_MAKEPLAN(3, dim, Nm, +1, 1, tol, &plan, &opts); + if (ier) return fail(WHERE, "makeplan error"); + + if (int e = assert_eq_double(WHERE, "plan-time upsampfac", plan->opts.upsampfac, 0.0)) + { FINUFFT_DESTROY(plan); return e; } + + BIGINT nk = 16; + std::vector s(nk), t(nk); + fill_rand(s.data(), nk); + fill_rand(t.data(), nk); + + // setpts #1: large bandwidth + BIGINT nj1 = 256; + std::vector x1(nj1), y1(nj1); + fill_rand(x1.data(), nj1, (FLT)(PI * 64)); // very wide + fill_rand(y1.data(), nj1, (FLT)(PI * 64)); + ier = FINUFFT_SETPTS(plan, nj1, x1.data(), y1.data(), nullptr, + nk, s.data(), t.data(), nullptr); + if (ier) { FINUFFT_DESTROY(plan); return fail(WHERE, "setpts #1 error"); } + + double density1 = double(nj1) / double(plan->N()); + double expect1 = bestUpsamplingFactor(opts.nthreads, (FLT)density1, dim, 3, tol); + if (int e = assert_eq_double(WHERE, "post-setpts #1 upsampfac", + plan->opts.upsampfac, expect1)) + { FINUFFT_DESTROY(plan); return e; } + + // setpts #2: tiny bandwidth (should not change under current heuristic) + BIGINT nj2 = 512; + std::vector x2(nj2), y2(nj2); + fill_rand(x2.data(), nj2, (FLT)(PI * 0.01)); // very narrow + fill_rand(y2.data(), nj2, (FLT)(PI * 0.01)); + ier = FINUFFT_SETPTS(plan, nj2, x2.data(), y2.data(), nullptr, + nk, s.data(), t.data(), nullptr); + if (ier) { FINUFFT_DESTROY(plan); return fail(WHERE, "setpts #2 error"); } + + double density2 = double(nj2) / double(plan->N()); + double expect2 = bestUpsamplingFactor(opts.nthreads, (FLT)density2, dim, 3, tol); + if (int e = assert_eq_double(WHERE, "post-setpts #2 upsampfac (may change in future)", + plan->opts.upsampfac, expect2)) + { FINUFFT_DESTROY(plan); return e; } + + FINUFFT_DESTROY(plan); + return 0; +} + +int main(int argc, char **argv) { + if (argc > 1) g_debug = std::atoi(argv[1]); + + // Type 1 & 2: forced upsampfac (hint ignored) + if (int e = test_type12_forced(1, 2.00)) return e; + if (int e = test_type12_forced(2, 2.00)) return e; + + // Type 1 & 2 (2D): hint==0 -> decide at first setpts, may change on next + if (int e = test_type12_hint0(1)) return e; + if (int e = test_type12_hint0(2)) return e; + + // Type 1 & 2 (2D): hint>0 -> decide at makeplan, may update if nj != hint + if (int e = test_type12_hintpos(1)) return e; + if (int e = test_type12_hintpos(2)) return e; + + // Type 3 (2D): forced upsampfac (hint ignored) + if (int e = test_type3_forced(2.00)) return e; + + // Type 3 (2D): hint ignored, 
heuristic decides inside setpts; we vary bandwidth + if (int e = test_type3_hint_ignored()) return e; + + if (g_debug) std::printf("All tests passed.\n"); + return 0; +} diff --git a/test/spread_simd_test.cpp b/test/spread_simd_test.cpp new file mode 100644 index 000000000..4c3211b78 --- /dev/null +++ b/test/spread_simd_test.cpp @@ -0,0 +1,161 @@ +#include "utils/norms.hpp" +#include +#include + +using namespace std; +using namespace finufft::utils; + +int main() { + unsigned int se = 1; + finufft_opts opts; + FINUFFT_DEFAULT_OPTS(&opts); + opts.spreadinterponly = 1; + int isign = +1; + double tol = 1e-6; + const FLT thresh = 50 * numeric_limits::epsilon(); + int fail = 0; + + // 1D spread + { + BIGINT M = 32, N = 64; + vector x(M); + vector c(M); + vector F0(N), F1(N), F2(N); + for (BIGINT j = 0; j < M; ++j) { + x[j] = PI * randm11r(&se); + c[j] = crandm11r(&se); + } + opts.spread_simd = 0; + FINUFFT1D1(M, x.data(), c.data(), isign, tol, N, F0.data(), &opts); + opts.spread_simd = 1; + FINUFFT1D1(M, x.data(), c.data(), isign, tol, N, F1.data(), &opts); + opts.spread_simd = 2; + FINUFFT1D1(M, x.data(), c.data(), isign, tol, N, F2.data(), &opts); + FLT err0 = relerrtwonorm(N, F0.data(), F2.data()); + FLT err1 = relerrtwonorm(N, F1.data(), F2.data()); + printf("spread 1D relerr0 %.3g relerr1 %.3g\n", err0, err1); + if (err0 > thresh || err1 > thresh) fail = 1; + } + + // 2D spread + { + BIGINT M = 32, N1 = 32, N2 = 16; + vector x(M), y(M); + vector c(M); + vector F0(N1 * N2), F1(N1 * N2), F2(N1 * N2); + for (BIGINT j = 0; j < M; ++j) { + x[j] = PI * randm11r(&se); + y[j] = PI * randm11r(&se); + c[j] = crandm11r(&se); + } + opts.spread_simd = 0; + FINUFFT2D1(M, x.data(), y.data(), c.data(), isign, tol, N1, N2, F0.data(), &opts); + opts.spread_simd = 1; + FINUFFT2D1(M, x.data(), y.data(), c.data(), isign, tol, N1, N2, F1.data(), &opts); + opts.spread_simd = 2; + FINUFFT2D1(M, x.data(), y.data(), c.data(), isign, tol, N1, N2, F2.data(), &opts); + FLT err0 = relerrtwonorm(N1 * N2, F0.data(), F2.data()); + FLT err1 = relerrtwonorm(N1 * N2, F1.data(), F2.data()); + printf("spread 2D relerr0 %.3g relerr1 %.3g\n", err0, err1); + if (err0 > thresh || err1 > thresh) fail = 1; + } + + // 3D spread + { + BIGINT M = 32, N1 = 16, N2 = 16, N3 = 16; + vector x(M), y(M), z(M); + vector c(M); + vector F0(N1 * N2 * N3), F1(N1 * N2 * N3), F2(N1 * N2 * N3); + for (BIGINT j = 0; j < M; ++j) { + x[j] = PI * randm11r(&se); + y[j] = PI * randm11r(&se); + z[j] = PI * randm11r(&se); + c[j] = crandm11r(&se); + } + opts.spread_simd = 0; + FINUFFT3D1(M, x.data(), y.data(), z.data(), c.data(), isign, tol, N1, N2, N3, + F0.data(), &opts); + opts.spread_simd = 1; + FINUFFT3D1(M, x.data(), y.data(), z.data(), c.data(), isign, tol, N1, N2, N3, + F1.data(), &opts); + opts.spread_simd = 2; + FINUFFT3D1(M, x.data(), y.data(), z.data(), c.data(), isign, tol, N1, N2, N3, + F2.data(), &opts); + FLT err0 = relerrtwonorm(N1 * N2 * N3, F0.data(), F2.data()); + FLT err1 = relerrtwonorm(N1 * N2 * N3, F1.data(), F2.data()); + printf("spread 3D relerr0 %.3g relerr1 %.3g\n", err0, err1); + if (err0 > thresh || err1 > thresh) fail = 1; + } + + // 1D interp + { + BIGINT M = 32, N = 64; + vector x(M); + vector c0(M), c1(M), c2(M); + vector F(N); + for (BIGINT j = 0; j < M; ++j) x[j] = PI * randm11r(&se); + for (BIGINT m = 0; m < N; ++m) F[m] = crandm11r(&se); + opts.spread_simd = 0; + FINUFFT1D2(M, x.data(), c0.data(), isign, tol, N, F.data(), &opts); + opts.spread_simd = 1; + FINUFFT1D2(M, x.data(), c1.data(), isign, tol, N, F.data(), &opts); 
+ opts.spread_simd = 2; + FINUFFT1D2(M, x.data(), c2.data(), isign, tol, N, F.data(), &opts); + FLT err0 = relerrtwonorm(M, c0.data(), c2.data()); + FLT err1 = relerrtwonorm(M, c1.data(), c2.data()); + printf("interp 1D relerr0 %.3g relerr1 %.3g\n", err0, err1); + if (err0 > thresh || err1 > thresh) fail = 1; + } + + // 2D interp + { + BIGINT M = 32, N1 = 32, N2 = 16; + vector x(M), y(M); + vector c0(M), c1(M), c2(M); + vector F(N1 * N2); + for (BIGINT j = 0; j < M; ++j) { + x[j] = PI * randm11r(&se); + y[j] = PI * randm11r(&se); + } + for (BIGINT m = 0; m < N1 * N2; ++m) F[m] = crandm11r(&se); + opts.spread_simd = 0; + FINUFFT2D2(M, x.data(), y.data(), c0.data(), isign, tol, N1, N2, F.data(), &opts); + opts.spread_simd = 1; + FINUFFT2D2(M, x.data(), y.data(), c1.data(), isign, tol, N1, N2, F.data(), &opts); + opts.spread_simd = 2; + FINUFFT2D2(M, x.data(), y.data(), c2.data(), isign, tol, N1, N2, F.data(), &opts); + FLT err0 = relerrtwonorm(M, c0.data(), c2.data()); + FLT err1 = relerrtwonorm(M, c1.data(), c2.data()); + printf("interp 2D relerr0 %.3g relerr1 %.3g\n", err0, err1); + if (err0 > thresh || err1 > thresh) fail = 1; + } + + // 3D interp + { + BIGINT M = 32, N1 = 16, N2 = 16, N3 = 16; + vector x(M), y(M), z(M); + vector c0(M), c1(M), c2(M); + vector F(N1 * N2 * N3); + for (BIGINT j = 0; j < M; ++j) { + x[j] = PI * randm11r(&se); + y[j] = PI * randm11r(&se); + z[j] = PI * randm11r(&se); + } + for (BIGINT m = 0; m < N1 * N2 * N3; ++m) F[m] = crandm11r(&se); + opts.spread_simd = 0; + FINUFFT3D2(M, x.data(), y.data(), z.data(), c0.data(), isign, tol, N1, N2, N3, + F.data(), &opts); + opts.spread_simd = 1; + FINUFFT3D2(M, x.data(), y.data(), z.data(), c1.data(), isign, tol, N1, N2, N3, + F.data(), &opts); + opts.spread_simd = 2; + FINUFFT3D2(M, x.data(), y.data(), z.data(), c2.data(), isign, tol, N1, N2, N3, + F.data(), &opts); + FLT err0 = relerrtwonorm(M, c0.data(), c2.data()); + FLT err1 = relerrtwonorm(M, c1.data(), c2.data()); + printf("interp 3D relerr0 %.3g relerr1 %.3g\n", err0, err1); + if (err0 > thresh || err1 > thresh) fail = 1; + } + + return fail; +} diff --git a/test/testutils.cpp b/test/testutils.cpp index ce8efd735..070e5300b 100644 --- a/test/testutils.cpp +++ b/test/testutils.cpp @@ -58,7 +58,7 @@ int main(int argc, char *argv[]) { // test Gauss-Legendre quadrature... const int n = 16; std::vector x(n), w(n); - gaussquad(n, x.data(), w.data()); + finufft::common::gaussquad(n, x.data(), w.data()); auto f = [](double x) { return sin(4 * x + 1.0) + 0.3; }; // a test func f(x)
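The recursive spread_subproblem_{1,2,3}d_dispatch and interpSorted_dispatch functions deleted in this diff are replaced by small caller functors handed to a generic dispatch() driven by make_range<MIN_NSPREAD, MAX_NSPREAD> and DispatchParam. The following is only a minimal, self-contained sketch of that runtime-to-compile-time dispatch idea, under invented names (kernel, Caller, dispatch_ns); it is not the library's actual dispatch/make_range/DispatchParam implementation, which also folds kerevalmeth into the parameter tuple.

#include <cstdio>
#include <utility>

// Stand-in for a spreader kernel templated on the compile-time kernel width NS.
template<int NS> static void kernel() { std::printf("running kernel for ns=%d\n", NS); }

// Bundles the runtime arguments, in the spirit of SpreadSubproblem1dCaller above.
struct Caller {
  int ns; // runtime value (e.g. opts.nspread) to turn into a template argument
  template<int NS> bool try_call() const {
    if (ns != NS) return false; // not this instantiation
    kernel<NS>();               // matched: run the statically-sized kernel
    return true;
  }
};

// Expand one try_call<NS>() per value in the integer sequence; the fold stops at a match.
template<int... NS>
static bool dispatch_ns(const Caller &c, std::integer_sequence<int, NS...>) {
  return (c.template try_call<NS>() || ...);
}

int main() {
  Caller c{7}; // runtime kernel width
  // Dispatch over ns = 0..15, a stand-in for the MIN_NSPREAD..MAX_NSPREAD range.
  if (!dispatch_ns(c, std::make_integer_sequence<int, 16>{}))
    std::printf("ns out of supported range\n");
  return 0;
}

As with the removed hand-rolled recursion, the number of template instantiations stays bounded by the length of the integer sequence, which is what the old dispatch comments warned about when MAX_NSPREAD grows.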
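The new gaussquad comment states that the caller pre-allocates xgl and wgl of length at least n and receives the Gauss-Legendre nodes and weights. A small usage sketch follows, mirroring the quadrature check in testutils.cpp; the include path finufft/finufft_utils.hpp is an assumption about where finufft::common::gaussquad is declared.

#include <cmath>
#include <cstdio>
#include <vector>
#include <finufft/finufft_utils.hpp> // assumed header declaring finufft::common::gaussquad

int main() {
  const int n = 16;
  std::vector<double> x(n), w(n); // caller-allocated nodes and weights
  finufft::common::gaussquad(n, x.data(), w.data());
  // Integrate f(x) = sin(4x+1) + 0.3 over [-1,1], the test function used in testutils.cpp.
  double I = 0.0;
  for (int i = 0; i < n; ++i) I += w[i] * (std::sin(4.0 * x[i] + 1.0) + 0.3);
  const double exact = (std::cos(3.0) - std::cos(5.0)) / 4.0 + 0.6; // analytic integral
  std::printf("quadrature %.15g, exact %.15g, err %.3g\n", I, exact, std::fabs(I - exact));
  return 0;
}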
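hint_nj_test.cpp exercises the new opts.hint_nj field: with opts.upsampfac == 0 and a positive hint, the upsampling factor is chosen already at makeplan from density = hint_nj / N, and may be re-evaluated at each setpts if the actual nj differs. A hedged double-precision guru-interface sketch of that usage; the field name hint_nj is taken from the test above, everything else is the standard plan API.

#include <complex>
#include <vector>
#include <finufft.h>

int main() {
  const int64_t M = 100000;        // expected number of nonuniform points
  int64_t n_modes[2] = {256, 256}; // 2D type-1 mode grid
  finufft_opts opts;
  finufft_default_opts(&opts);
  opts.upsampfac = 0.0; // let the heuristic choose sigma ...
  opts.hint_nj = M;     // ... already at makeplan, from density = hint_nj / N (new field in this diff)

  finufft_plan plan;
  int ier = finufft_makeplan(1, 2, n_modes, +1, 1, 1e-9, &plan, &opts);
  if (ier) return ier;

  std::vector<double> x(M, 0.1), y(M, -0.2);          // NU points in [-pi,pi)
  std::vector<std::complex<double>> c(M, {1.0, 0.0}); // strengths
  std::vector<std::complex<double>> F(n_modes[0] * n_modes[1]);

  ier = finufft_setpts(plan, M, x.data(), y.data(), nullptr, 0, nullptr, nullptr, nullptr);
  if (!ier) ier = finufft_execute(plan, c.data(), F.data());
  finufft_destroy(plan);
  return ier;
}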
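spread_simd_test.cpp compares results for opts.spread_simd = 0, 1 and 2, and setup_spreader now defaults the spreader-level opts.simd to 2, with the scalar kernels taken when it is 1. A minimal simple-interface sketch that requests the scalar path; the meaning of the values (1 = scalar, 2 = SIMD, 0 presumably auto) is inferred from the test and spreader code above rather than stated in the diff.

#include <complex>
#include <vector>
#include <finufft.h>

int main() {
  const int64_t M = 1000, N = 256;
  std::vector<double> x(M, 0.5);                      // NU points in [-pi,pi)
  std::vector<std::complex<double>> c(M, {1.0, 0.0}); // strengths
  std::vector<std::complex<double>> F(N);             // output Fourier modes

  finufft_opts opts;
  finufft_default_opts(&opts);
  opts.spread_simd = 1; // assumed: request the scalar spread/interp kernels (2 = SIMD default)

  // 1D type-1 transform; only the spread/interp path differs between spread_simd settings.
  return finufft1d1(M, x.data(), c.data(), +1, 1e-9, N, F.data(), &opts);
}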