From b988ffb17869b6cf0b23778f990c50a2e99ef231 Mon Sep 17 00:00:00 2001
From: Qi Wu <qiwu@nvidia.com>
Date: Sat, 18 Apr 2026 22:34:50 -0700
Subject: [PATCH 1/5] Add submodule for base project, update .gitignore, and
 enhance CMake configuration

- Added a new submodule for the base project.
- Expanded .gitignore to exclude Python, build, editor, and model-checkpoint artifacts.
- Enhanced CMake configuration to support standalone builds and improved CUDA settings.
- Modified batch trainer and neural sampler to include new voxel types.
- Introduced a setup script for easier CMake configuration and building.
---
 .gitignore                       | 156 +++++++++++++++++++++++++++++++
 .gitmodules                      |   3 +
 CMakeLists.txt                   | 156 ++++++++++++++++++++-----------
 apps/CMakeLists.txt              |  39 ++------
 apps/batch_trainer.cpp           |  23 +++--
 base                             |   1 +
 core/CMakeLists.txt              |  54 +++--------
 core/network.cu                  |  45 +--------
 core/samplers/neural_sampler.cpp |  23 +++--
 device/CMakeLists.txt            |  36 ++++---
 device/device_impl.cpp           |   2 +-
 setup_cmake.sh                   |  66 +++++++++++++
 tcnn                             |   2 +-
 13 files changed, 409 insertions(+), 197 deletions(-)
 create mode 160000 base
 create mode 100644 setup_cmake.sh

diff --git a/.gitignore b/.gitignore
index 64e24f8..0da745d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,157 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# Datasets
+logs/
+.DS_Store
+
+composer/
+eval_metrics_*
 params.json
+params.pt
+
+/deps/
+model*.pt
+output.json
+.vscode/
+
+
+build
+build_dav
+build/
+build_*/
+*.pt
+
+magma/
+imgui.ini
+
+# for sharing test ckpt file
+!SIREN_shadow_test/sample_siren_1499_validation.pt
+SIREN_shadow_test/*.bson
diff --git a/.gitmodules b/.gitmodules
index 6adfc5b..f2652c9 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "tcnn"]
 	path = tcnn
 	url = https://github.com/wilsonCernWq/tiny-cuda-nn.git
+[submodule "base"]
+	path = base
+	url = https://github.com/wilsonCernWq/open-volume-renderer.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 194e806..d6cd71c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,69 +1,119 @@
 # ======================================================================== #
-# Copyright 2019-2022 Qi Wu                                                #
+# Copyright 2019-2024 Qi Wu                                                #
 #                                                                          #
-# Licensed under the Apache License, Version 2.0 (the "License");          #
-# you may not use this file except in compliance with the License.         #
-# You may obtain a copy of the License at                                  #
-#                                                                          #
-#     http://www.apache.org/licenses/LICENSE-2.0                           #
-#                                                                          #
-# Unless required by applicable law or agreed to in writing, software      #
-# distributed under the License is distributed on an "AS IS" BASIS,        #
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
-# See the License for the specific language governing permissions and      #
-# limitations under the License.                                           #
+# Licensed under the Apache License, Version 2.0                           #
 # ======================================================================== #
-
 #
-# This project is designed to be compiled with OVR
+# Standalone build — OVR (base/) and TCNN (tcnn/) are in-tree submodules added
+# with add_subdirectory(); build outputs are staged in one directory so they
+# can be shipped as a self-contained `instantvnr` Python package directory.
 #
 
-option(OVR_BUILD_MODULE_NNVOLUME "Build Volumetric Neural Representation Device" OFF)
+# cmake_minimum_required() comes first so it establishes the policy baseline
+# before project(); VERSION 3.24 already enables CMP0048 (NEW), so no explicit
+# cmake_policy() call is needed.
+
+cmake_minimum_required(VERSION 3.24)
+project(instantvnr LANGUAGES C CXX CUDA)
+
+# OVR cmake modules (configure_build_type, configure_cxx, configure_cuda …)
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/base/cmake")
 
-if(OVR_BUILD_MODULE_NNVOLUME)
+include(configure_build_type)
+include(configure_cxx)
+include(configure_cuda)
+# configure_optix is NOT included here — it unconditionally fetches OptiX headers
+# even when OVR_BUILD_DEVICE_OPTIX7=OFF.  OVR handles optix internally when enabled.
 
-  message(STATUS "Enable Neural Volume Device")
+###############################################################################
+# GLIBCXX ABI — defaults to 1 (GCC ≥5 default); override with
+#   -DIVNR_GLIBCXX_USE_CXX11_ABI=0  when linking against a library built with
+#   the old ABI (e.g. PyTorch wheels that ship _GLIBCXX_USE_CXX11_ABI=0).
+###############################################################################
+option(IVNR_GLIBCXX_USE_CXX11_ABI "Use GLIBCXX CXX11 ABI" ON)
+# Start from the new-ABI definition and downgrade only when the option is OFF.
+set(IVNR_GLIBCXX_CXX11_ABI _GLIBCXX_USE_CXX11_ABI=1)
+if(NOT IVNR_GLIBCXX_USE_CXX11_ABI)
+  set(IVNR_GLIBCXX_CXX11_ABI _GLIBCXX_USE_CXX11_ABI=0)
+endif()
+message(STATUS "GLIBCXX_CXX11_ABI: ${IVNR_GLIBCXX_USE_CXX11_ABI}")
 
-  if(DEFINED GDT_CUDA_ARCHITECTURES) 
-    message(STATUS "Obtained target architecture from environment variable GDT_CUDA_ARCHITECTURES=${GDT_CUDA_ARCHITECTURES}")
-    set(ENV{TCNN_CUDA_ARCHITECTURES} ${GDT_CUDA_ARCHITECTURES})
-  endif()
+###############################################################################
+# Staging directory — all shared libs and executables land here so the install
+# step can copy them as a single package directory (same pattern as pysampler).
+###############################################################################
+set(_ivnr_staging "${CMAKE_BINARY_DIR}/instantvnr")
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${_ivnr_staging}")
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${_ivnr_staging}")
+set(CMAKE_INSTALL_RPATH              "$ORIGIN")
+set(CMAKE_BUILD_WITH_INSTALL_RPATH   ON)
+set(CMAKE_POSITION_INDEPENDENT_CODE  ON)
 
-  ################
-  # tiny-cuda-nn #
-  ################
-  if(NOT TARGET tiny-cuda-nn)
-    set(TCNN_BUILD_BENCHMARK OFF)
-    set(TCNN_BUILD_EXAMPLES OFF)
-    add_subdirectory(tcnn EXCLUDE_FROM_ALL)
-    target_compile_definitions(tiny-cuda-nn PUBLIC    ${TCNN_DEFINITIONS})
-    target_compile_definitions(tiny-cuda-nn INTERFACE TCNN_NAMESPACE=tcnn)
-    target_include_directories(tiny-cuda-nn INTERFACE ${CMAKE_CURRENT_LIST_DIR}/tcnn/include)
-    target_include_directories(tiny-cuda-nn INTERFACE ${CMAKE_CURRENT_LIST_DIR}/tcnn/dependencies)
-    if(UNIX)
-      set_target_properties(fmt PROPERTIES POSITION_INDEPENDENT_CODE ON)
-      set_target_properties(tiny-cuda-nn PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    endif()
-  endif()
+###############################################################################
+# OVR (OpenGL, GLFW, ImGui, GDT math, TBB, TFN module)
+###############################################################################
+set(OVR_BUILD_OPENGL ON CACHE BOOL "" FORCE)
+set(OVR_BUILD_CUDA ON CACHE BOOL "" FORCE)
+set(OVR_BUILD_DEVICE_OPTIX7 OFF CACHE BOOL "" FORCE)
+set(OVR_BUILD_DEVICE_OSPRAY OFF CACHE BOOL "" FORCE)
+set(OVR_BUILD_PYTHON_BINDINGS OFF CACHE BOOL "" FORCE)
+set(OVR_BUILD_APPS OFF CACHE BOOL "" FORCE)
+add_subdirectory(base)
 
-  # Two projects uses different target names ...
-  if((NOT TARGET util) AND (TARGET rendercommon))
-    add_library(util ALIAS rendercommon)
-  endif()
+###############################################################################
+# TBB (may be brought in by OVR; guard against double-find)
+###############################################################################
+if(NOT TARGET TBB::tbb)
+  find_package(TBB CONFIG REQUIRED)
+endif()
 
-  # Simulate include paths
-  include_directories(${CMAKE_CURRENT_LIST_DIR})
 
-  ################
-  #
-  ################
-  set(MACROCELL_SIZE_MIP 4)
-  set(ENABLE_IN_SHADER ON)
-  set(ENABLE_OUT_OF_CORE ON)
-  set(ENABLE_OPENGL ON)
+###############################################################################
+# tiny-cuda-nn — built from the in-tree tcnn/ checkout (git submodule).
+# NOTE(review): no FetchContent fallback is implemented in this file, so the
+# configure step fails if tcnn/ is missing; TCNN_REPOSITORY / TCNN_COMMIT_HASH
+# are not read anywhere here. Run `git submodule update --init --recursive`.
+###############################################################################
+set(TCNN_BUILD_BENCHMARK OFF CACHE BOOL "" FORCE)
+set(TCNN_BUILD_EXAMPLES  OFF CACHE BOOL "" FORCE)
+add_subdirectory(tcnn)
+# The tcnn subdirectory is expected to define the `tiny-cuda-nn` target.
+# Attach TCNN_NAMESPACE=tcnn as a PUBLIC definition so it propagates to every
+# target that links tiny-cuda-nn, without modifying TCNN source.
+target_compile_definitions(tiny-cuda-nn PUBLIC TCNN_NAMESPACE=tcnn)
 
-  add_subdirectory(core)
-  add_subdirectory(device)
-  add_subdirectory(apps)
+###############################################################################
+# OVR compatibility aliases
+###############################################################################
+if((NOT TARGET util) AND (TARGET rendercommon))
+  add_library(util ALIAS rendercommon)
+endif()
 
+# GFX_LIBRARIES: interactive viewer targets provided by OVR's OpenGL layer.
+# OVR sets this variable; provide a safe fallback for standalone builds.
+if(NOT DEFINED GFX_LIBRARIES)
+  set(GFX_LIBRARIES glfwApp)
 endif()
+
+###############################################################################
+# Global include paths (api.h, api_internal.h live at the project root)
+###############################################################################
+include_directories(${CMAKE_CURRENT_LIST_DIR})
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
+###############################################################################
+# Feature flags (match references/instantvnr original defaults)
+###############################################################################
+set(MACROCELL_SIZE_MIP 4)
+set(ENABLE_IN_SHADER   ON)
+set(ENABLE_OUT_OF_CORE ON)
+set(ENABLE_OPENGL      ${OVR_BUILD_OPENGL})
+
+###############################################################################
+# Subprojects
+###############################################################################
+add_subdirectory(core)
+# device/ is the OVR plugin layer; build it as a shared library so it is
+# co-installed and available for optional OVR integration.
+add_subdirectory(device)
+add_subdirectory(apps)
diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt
index 419dde7..61f214f 100644
--- a/apps/CMakeLists.txt
+++ b/apps/CMakeLists.txt
@@ -1,36 +1,14 @@
 # ======================================================================== #
-# Copyright 2019-2020 Qi Wu                                                #
-#                                                                          #
-# Licensed under the Apache License, Version 2.0 (the "License");          #
-# you may not use this file except in compliance with the License.         #
-# You may obtain a copy of the License at                                  #
-#                                                                          #
-#     http://www.apache.org/licenses/LICENSE-2.0                           #
-#                                                                          #
-# Unless required by applicable law or agreed to in writing, software      #
-# distributed under the License is distributed on an "AS IS" BASIS,        #
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
-# See the License for the specific language governing permissions and      #
-# limitations under the License.                                           #
+# Copyright 2019-2022 Qi Wu                                                #
+# Licensed under the Apache License, Version 2.0                           #
 # ======================================================================== #
 # ======================================================================== #
 # Copyright 2018-2019 Ingo Wald                                            #
-#                                                                          #
-# Licensed under the Apache License, Version 2.0 (the "License");          #
-# you may not use this file except in compliance with the License.         #
-# You may obtain a copy of the License at                                  #
-#                                                                          #
-#     http://www.apache.org/licenses/LICENSE-2.0                           #
-#                                                                          #
-# Unless required by applicable law or agreed to in writing, software      #
-# distributed under the License is distributed on an "AS IS" BASIS,        #
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
-# See the License for the specific language governing permissions and      #
-# limitations under the License.                                           #
+# Licensed under the Apache License, Version 2.0                           #
 # ======================================================================== #
 
 set(CMAKE_CXX_STANDARD  17)
-set(CMAKE_CUDA_STANDARD 14)
+set(CMAKE_CUDA_STANDARD 17)
 
 if(NOT target)
   set(target instantvnr)
@@ -39,6 +17,7 @@ endif()
 # ======================================================================== #
 # applications
 # ======================================================================== #
+
 if(ENABLE_OPENGL)
   add_executable(vnr_int_dual int_dual_volume.cpp)
   target_link_libraries(vnr_int_dual   PRIVATE ${target} ${GFX_LIBRARIES})
@@ -53,7 +32,7 @@ target_link_libraries(vnr_cmd_train PRIVATE ${target})
 add_executable(vnr_cmd_render batch_renderer.cpp)
 target_link_libraries(vnr_cmd_render PRIVATE ${target})
 
-if (ENABLE_IN_SHADER)
+if(ENABLE_IN_SHADER)
   add_executable(vnr_cmd_isosurface batch_isosurface.cpp)
   target_link_libraries(vnr_cmd_isosurface PRIVATE ${target})
 endif()
@@ -61,13 +40,13 @@ endif()
 add_executable(view_model view_model.cpp)
 target_link_libraries(view_model PRIVATE ${target})
 
-# OVR application
+# OVR plugin applications — only built when instantvnr is loaded as an OVR device
 if(DEFINED OVR_BUILD_MODULE_NNVOLUME)
 
   add_executable(generate_shadow_map shadowmap.cu)
-  target_link_libraries(generate_shadow_map PUBLIC  renderlib glfwApp $<TARGET_OBJECTS:device_nnvolume_array>)
+  target_link_libraries(generate_shadow_map PUBLIC renderlib glfwApp $<TARGET_OBJECTS:device_nnvolume_array>)
 
-  if (ENABLE_IN_SHADER)
+  if(ENABLE_IN_SHADER)
     add_executable(vnr_int_isosurface int_isosurface.cu)
     target_link_libraries(vnr_int_isosurface PUBLIC renderlib ${target} ${GFX_LIBRARIES})
   endif()
diff --git a/apps/batch_trainer.cpp b/apps/batch_trainer.cpp
index d75b13c..5250a59 100644
--- a/apps/batch_trainer.cpp
+++ b/apps/batch_trainer.cpp
@@ -69,14 +69,21 @@ struct CmdArgs : CmdArgsBase {
 
 namespace vidi {
 enum VoxelType {
-  VOXEL_UINT8  = vnr::VALUE_TYPE_UINT8,
-  VOXEL_INT8   = vnr::VALUE_TYPE_INT8,
-  VOXEL_UINT16 = vnr::VALUE_TYPE_UINT16,
-  VOXEL_INT16  = vnr::VALUE_TYPE_INT16,
-  VOXEL_UINT32 = vnr::VALUE_TYPE_UINT32,
-  VOXEL_INT32  = vnr::VALUE_TYPE_INT32,
-  VOXEL_FLOAT  = vnr::VALUE_TYPE_FLOAT,
-  VOXEL_DOUBLE = vnr::VALUE_TYPE_DOUBLE,
+  VOXEL_UINT8   = vnr::VALUE_TYPE_UINT8,
+  VOXEL_INT8    = vnr::VALUE_TYPE_INT8,
+  VOXEL_UINT16  = vnr::VALUE_TYPE_UINT16,
+  VOXEL_INT16   = vnr::VALUE_TYPE_INT16,
+  VOXEL_UINT32  = vnr::VALUE_TYPE_UINT32,
+  VOXEL_INT32   = vnr::VALUE_TYPE_INT32,
+  VOXEL_FLOAT   = vnr::VALUE_TYPE_FLOAT,
+  VOXEL_FLOAT2  = vnr::VALUE_TYPE_FLOAT2,
+  VOXEL_FLOAT3  = vnr::VALUE_TYPE_FLOAT3,
+  VOXEL_FLOAT4  = vnr::VALUE_TYPE_FLOAT4,
+  VOXEL_DOUBLE  = vnr::VALUE_TYPE_DOUBLE,
+  // multi-channel double types: vnr has no counterpart; use sentinel values
+  VOXEL_DOUBLE2 = 501,
+  VOXEL_DOUBLE3 = 502,
+  VOXEL_DOUBLE4 = 503,
 };
 } // namespace vidi
 #define VIDI_VOLUME_EXTERNAL_TYPE_ENUM
diff --git a/base b/base
new file mode 160000
index 0000000..d66050c
--- /dev/null
+++ b/base
@@ -0,0 +1 @@
+Subproject commit d66050c1e2bcf7755cd70efa86b3efdcf0eaef01
diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index cd7608c..581dba8 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -1,43 +1,21 @@
 # ======================================================================== #
-# Copyright 2019-2020 Qi Wu                                                #
-#                                                                          #
-# Licensed under the Apache License, Version 2.0 (the "License");          #
-# you may not use this file except in compliance with the License.         #
-# You may obtain a copy of the License at                                  #
-#                                                                          #
-#     http://www.apache.org/licenses/LICENSE-2.0                           #
-#                                                                          #
-# Unless required by applicable law or agreed to in writing, software      #
-# distributed under the License is distributed on an "AS IS" BASIS,        #
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
-# See the License for the specific language governing permissions and      #
-# limitations under the License.                                           #
+# Copyright 2019-2022 Qi Wu                                                #
+# Licensed under the Apache License, Version 2.0                           #
 # ======================================================================== #
 # ======================================================================== #
 # Copyright 2018-2019 Ingo Wald                                            #
-#                                                                          #
-# Licensed under the Apache License, Version 2.0 (the "License");          #
-# you may not use this file except in compliance with the License.         #
-# You may obtain a copy of the License at                                  #
-#                                                                          #
-#     http://www.apache.org/licenses/LICENSE-2.0                           #
-#                                                                          #
-# Unless required by applicable law or agreed to in writing, software      #
-# distributed under the License is distributed on an "AS IS" BASIS,        #
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
-# See the License for the specific language governing permissions and      #
-# limitations under the License.                                           #
+# Licensed under the Apache License, Version 2.0                           #
 # ======================================================================== #
 
 set(CMAKE_CXX_STANDARD  17)
-set(CMAKE_CUDA_STANDARD 14)
+set(CMAKE_CUDA_STANDARD 17)
 
 if(NOT target)
   set(target instantvnr)
 endif()
 message(STATUS "target - ${target}")
 
-include(configure_cuda) # skip optix here
+include(configure_cuda) # include_guard(GLOBAL) makes this idempotent
 
 if(DEFINED MACROCELL_SIZE_MIP)
   add_definitions(-DMACROCELL_SIZE_MIP=${MACROCELL_SIZE_MIP})
@@ -117,8 +95,8 @@ if(ENABLE_FVSRN)
   target_compile_definitions(${target} PRIVATE ENABLE_FVSRN=1)
 endif()
 
-set_target_properties(${target} 
-PROPERTIES 
+set_target_properties(${target}
+PROPERTIES
   CXX_STANDARD 17
   CUDA_STANDARD 17
   CUDA_RESOLVE_DEVICE_SYMBOLS ON
@@ -130,15 +108,14 @@ target_include_directories(${target} PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE
 target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/..>)
 target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/../..>)
 
-target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${CUDA_NVCC_FLAGS}>)
-target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe="--diag_suppress=177">)   # -> variable declared but not referenced
-target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe="--diag_suppress=20044">) # -> extern variable '...' treated as static
+target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe="--diag_suppress=177">)   # variable declared but not referenced
+target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe="--diag_suppress=20044">) # extern variable treated as static
 
-target_link_libraries(${target} PRIVATE TBB::tbb curand)
+target_link_libraries(${target} PRIVATE TBB::tbb curand tiny-cuda-nn)
+target_link_libraries(${target} PUBLIC gdt util tfnmodule)
 
-target_link_libraries(${target} PUBLIC # intentionally keep these library public
-  gdt util tfnmodule tiny-cuda-nn
-)
+# GLIBCXX ABI compile definition — must match the installed PyTorch
+target_compile_definitions(${target} PUBLIC ${IVNR_GLIBCXX_CXX11_ABI})
 
 if(TARGET openvkl::openvkl AND TARGET openvkl::testing)
   target_compile_definitions(${target} PRIVATE ENABLE_OPENVKL=1)
@@ -148,8 +125,3 @@ endif()
 if(ENABLE_LOGGING)
   target_compile_definitions(${target} PRIVATE ENABLE_LOGGING=1)
 endif()
-
-# target_include_directories(vnr_core PUBLIC $<$<PLATFORM_ID:Windows>:${INTEL_AIO_INC}>)
-# target_link_libraries     (vnr_core PUBLIC $<$<PLATFORM_ID:Windows>:${INTEL_AIO_LIB}>)
-# target_compile_definitions(vnr_core PUBLIC AIO_INTEL)
-# target_compile_definitions(vnr_core PRIVATE TCNN_NAMESPACE=tcnn)
diff --git a/core/network.cu b/core/network.cu
index c6f7eda..64077e8 100644
--- a/core/network.cu
+++ b/core/network.cu
@@ -41,49 +41,8 @@
 #include <vector>
 #include <ctime>
 
-// make a private version of thrust::plus to avoid template instantiation conflicts ...
-namespace {
-
-template<typename T>
-struct maximum_op {
-  typedef T first_argument_type;
-  typedef T second_argument_type;
-  typedef T result_type;
-  __host__ __device__ constexpr T operator()(const T& lhs, const T& rhs) const { return lhs < rhs ? rhs : lhs; }
-}; // end maximum
-
-template<typename T>
-struct minimum_op {
-  typedef T first_argument_type;
-  typedef T second_argument_type;
-  typedef T result_type;
-  __host__ __device__ constexpr T operator()(const T& lhs, const T& rhs) const { return lhs < rhs ? lhs : rhs; }
-}; // end minimum
-
-template<typename T>
-struct plus {
-  typedef T first_argument_type;
-  typedef T second_argument_type;
-  typedef T result_type;
-  __host__ __device__ constexpr T operator()(const T &lhs, const T &rhs) const { return lhs + rhs; }
-}; // end plus
-
-template<typename T>
-T parallel_sum_gpu(const T* __restrict__ data, size_t count, cudaStream_t stream = nullptr) {
-  const auto begin = thrust::device_ptr<const T>(data);
-  const auto end = begin + count;
-  return thrust::reduce(thrust::cuda::par.on(stream), begin, end, T(0), plus<T>());
-}
-
-template<typename T>
-void parallel_minmax_gpu(const T* __restrict__ data, size_t count, T& init_min, T& init_max, cudaStream_t stream = nullptr) {
-  const auto begin = thrust::device_ptr<const T>(data);
-  const auto end = begin + count;
-  init_min = thrust::reduce(thrust::cuda::par.on(stream), begin, end, init_min, minimum_op<T>());
-  init_max = thrust::reduce(thrust::cuda::par.on(stream), begin, end, init_max, maximum_op<T>());
-}
-
-}
+// These helpers (maximum_op, minimum_op, plus, parallel_sum_gpu, parallel_minmax_gpu)
+// are now provided by evaluation_kernel.h (ovr/ovr/common/).
 
 namespace vnr {
 
diff --git a/core/samplers/neural_sampler.cpp b/core/samplers/neural_sampler.cpp
index 2540394..1db249e 100644
--- a/core/samplers/neural_sampler.cpp
+++ b/core/samplers/neural_sampler.cpp
@@ -2,14 +2,21 @@
 
 namespace vidi {
 enum VoxelType {
-  VOXEL_UINT8  = vnr::VALUE_TYPE_UINT8,
-  VOXEL_INT8   = vnr::VALUE_TYPE_INT8,
-  VOXEL_UINT16 = vnr::VALUE_TYPE_UINT16,
-  VOXEL_INT16  = vnr::VALUE_TYPE_INT16,
-  VOXEL_UINT32 = vnr::VALUE_TYPE_UINT32,
-  VOXEL_INT32  = vnr::VALUE_TYPE_INT32,
-  VOXEL_FLOAT  = vnr::VALUE_TYPE_FLOAT,
-  VOXEL_DOUBLE = vnr::VALUE_TYPE_DOUBLE,
+  VOXEL_UINT8   = vnr::VALUE_TYPE_UINT8,
+  VOXEL_INT8    = vnr::VALUE_TYPE_INT8,
+  VOXEL_UINT16  = vnr::VALUE_TYPE_UINT16,
+  VOXEL_INT16   = vnr::VALUE_TYPE_INT16,
+  VOXEL_UINT32  = vnr::VALUE_TYPE_UINT32,
+  VOXEL_INT32   = vnr::VALUE_TYPE_INT32,
+  VOXEL_FLOAT   = vnr::VALUE_TYPE_FLOAT,
+  VOXEL_FLOAT2  = vnr::VALUE_TYPE_FLOAT2,
+  VOXEL_FLOAT3  = vnr::VALUE_TYPE_FLOAT3,
+  VOXEL_FLOAT4  = vnr::VALUE_TYPE_FLOAT4,
+  VOXEL_DOUBLE  = vnr::VALUE_TYPE_DOUBLE,
+  // multi-channel double types: vnr has no counterpart; use sentinel values
+  VOXEL_DOUBLE2 = 501,
+  VOXEL_DOUBLE3 = 502,
+  VOXEL_DOUBLE4 = 503,
 };
 } // namespace vidi
 #define VIDI_VOLUME_EXTERNAL_TYPE_ENUM
diff --git a/device/CMakeLists.txt b/device/CMakeLists.txt
index 683a5c7..f527170 100644
--- a/device/CMakeLists.txt
+++ b/device/CMakeLists.txt
@@ -1,10 +1,13 @@
 ################
-#
+# device_nnvolume_array — OBJECT library used by the shadow-map app and as a
+# build unit for the device plugin.  Standalone builds still compile it so the
+# full feature set is preserved; it is only *linked* when OVR_BUILD_MODULE_NNVOLUME
+# is defined (i.e., when building as part of the OVR plugin framework).
 ################
-add_library(device_nnvolume_array OBJECT 
+add_library(device_nnvolume_array OBJECT
   device_nnvolume_array.cpp
 )
-set_target_properties(device_nnvolume_array 
+set_target_properties(device_nnvolume_array
 PROPERTIES
   CUDA_RESOLVE_DEVICE_SYMBOLS ON
   CUDA_SEPARABLE_COMPILATION ON
@@ -13,24 +16,33 @@ PROPERTIES
 target_link_libraries(device_nnvolume_array PRIVATE rendercommon TBB::tbb)
 
 ################
-#
+# device_nnvolume — standalone build uses SHARED so the library is installed
+# alongside other .so files.  In OVR plugin mode the original MODULE type is
+# appropriate; use the INSTANTVNR_STANDALONE cache variable to switch.
 ################
-add_library(device_nnvolume MODULE
+option(INSTANTVNR_STANDALONE "Build device_nnvolume as SHARED (pip package) instead of MODULE (OVR plugin)" ON)
+
+if(INSTANTVNR_STANDALONE)
+  set(_dev_type SHARED)
+else()
+  set(_dev_type MODULE)
+endif()
+
+add_library(device_nnvolume ${_dev_type}
   $<TARGET_OBJECTS:device_nnvolume_array>
   device.cpp
   device_impl.cpp
-  # method_shadowmap.cu
+  # method_shadowmap.cu  # uncomment to include shadow-map GPU code
 )
-set_target_properties(device_nnvolume 
+set_target_properties(device_nnvolume
 PROPERTIES
   CUDA_RESOLVE_DEVICE_SYMBOLS ON
   CUDA_SEPARABLE_COMPILATION ON
 )
 
-################
-#
-################
 target_link_libraries     (device_nnvolume PRIVATE rendercommon TBB::tbb instantvnr)
 target_compile_options    (device_nnvolume PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${CUDA_NVCC_FLAGS}>)
-target_compile_definitions(device_nnvolume PRIVATE OVR_BUILD_MODULE_NNVOLUME=1)
-target_compile_definitions(device_nnvolume PRIVATE MACROCELL_SIZE_MIP=${MACROCELL_SIZE_MIP})
+target_compile_definitions(device_nnvolume PRIVATE
+  ${IVNR_GLIBCXX_CXX11_ABI}
+  MACROCELL_SIZE_MIP=${MACROCELL_SIZE_MIP}
+)
diff --git a/device/device_impl.cpp b/device/device_impl.cpp
index 83160c7..cde89a6 100644
--- a/device/device_impl.cpp
+++ b/device/device_impl.cpp
@@ -163,7 +163,7 @@ DeviceNNVolume::Impl::commit()
   /* commit other data */
   if (parent->params.camera.update()) {
     const auto& camera = parent->params.camera.ref();
-    camera_latest = vnr::Camera{ camera.from, camera.at, camera.up };
+    camera_latest = vnr::Camera{ camera.eye, camera.at, camera.up };
     framebuffer_reset = true;
   }
 
diff --git a/setup_cmake.sh b/setup_cmake.sh
new file mode 100644
index 0000000..2298129
--- /dev/null
+++ b/setup_cmake.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+# Configure and build the C++ instantvnr libraries and executables directly
+# via CMake, without going through Python packaging.
+#
+# Usage:
+#   ./setup_cmake.sh               # auto-detect everything
+#   SM=86 ./setup_cmake.sh         # override GPU arch
+#   BUILD_DIR=build ./setup_cmake.sh  # custom build directory
+#   ./setup_cmake.sh --configure   # configure only (skip build)
+#   ./setup_cmake.sh --build       # build only (skip configure)
+#
+# Requires:
+#   - CUDA toolkit (nvcc in PATH or /usr/local/cuda)
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BUILD_DIR="${BUILD_DIR:-${SCRIPT_DIR}/build}"
+
+DO_CONFIGURE=true
+DO_BUILD=true
+for arg in "$@"; do
+  case "$arg" in
+    --configure) DO_BUILD=false ;;
+    --build)     DO_CONFIGURE=false ;;
+  esac
+done
+
+# ── detect GPU SM ─────────────────────────────────────────────────────────────
+# Env SM wins; else query nvidia-smi; else "native" if it is missing or fails.
+if [[ -n "${SM:-}" ]]; then
+    echo "[info] Using SM=$SM from environment"
+elif SM=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | head -1 | tr -d '. ') && [[ "$SM" =~ ^[0-9]+$ ]]; then
+    echo "[info] Detected GPU sm_$SM"
+else
+    echo "[warn] nvidia-smi unavailable or failed — defaulting to native arch detection"
+    SM="native"
+fi
+
+# ── detect CUDA toolkit ───────────────────────────────────────────────────────
+if   command -v nvcc &>/dev/null;       then CUDA_HOME="$(realpath "$(dirname "$(command -v nvcc)")/..")"
+elif [[ -x /usr/local/cuda/bin/nvcc ]]; then CUDA_HOME="/usr/local/cuda"
+else
+    echo "[error] nvcc not found — CUDA toolkit is required" >&2
+    exit 1
+fi
+echo "[info] CUDA_HOME: $CUDA_HOME"
+
+export PATH="$CUDA_HOME/bin${PATH:+:$PATH}"
+export LD_LIBRARY_PATH="$CUDA_HOME/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+
+# ── configure ─────────────────────────────────────────────────────────────────
+if [[ "$DO_CONFIGURE" == true ]]; then
+    echo "[info] Configuring in $BUILD_DIR (SM=$SM)"
+    cmake -S "$SCRIPT_DIR" -B "$BUILD_DIR" \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DCMAKE_CUDA_ARCHITECTURES="$SM"
+fi
+
+# ── build ─────────────────────────────────────────────────────────────────────
+if [[ "$DO_BUILD" == true ]]; then
+    JOBS="${JOBS:-$(nproc 2>/dev/null || echo 1)}"  # fall back to 1 job if nproc is unavailable
+    echo "[info] Building with $JOBS parallel jobs"
+    cmake --build "$BUILD_DIR" --config Release --parallel "$JOBS"  # generator-agnostic parallelism
+    echo "[info] Build complete. Outputs in $BUILD_DIR/instantvnr/"
+fi
diff --git a/tcnn b/tcnn
index 5bd6b38..71c6afb 160000
--- a/tcnn
+++ b/tcnn
@@ -1 +1 @@
-Subproject commit 5bd6b38dead69e74f7233ec69a8f3bc043ae2a1d
+Subproject commit 71c6afbc5202ccf8de6dfdf612065cf7f941cb74

From bc3d4676ee029513ffee7a1cd669f01ad75bf630 Mon Sep 17 00:00:00 2001
From: Qi Wu <qiwu@nvidia.com>
Date: Sat, 18 Apr 2026 22:51:33 -0700
Subject: [PATCH 2/5] Refactor CMake configuration and remove shadow map
 components

- Updated the staging directory in CMakeLists.txt to point to a new bin directory.
- Removed shadow map related executables and files from the apps directory, including shadowmap.cu and shadowmap.usda.
- Adjusted the CMake configuration to exclude shadow map methods from the device library.
---
 CMakeLists.txt             |   2 +-
 apps/CMakeLists.txt        |  14 +-
 apps/int_isosurface.cu     |   2 +-
 apps/shadowmap.cu          | 686 ---------------------------------
 apps/shadowmap.usda        |  48 ---
 device/CMakeLists.txt      |   1 -
 device/method_shadowmap.cu | 759 -------------------------------------
 device/method_shadowmap.h  |  24 --
 8 files changed, 5 insertions(+), 1531 deletions(-)
 delete mode 100644 apps/shadowmap.cu
 delete mode 100644 apps/shadowmap.usda
 delete mode 100644 device/method_shadowmap.cu
 delete mode 100644 device/method_shadowmap.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d6cd71c..18f6efb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -42,7 +42,7 @@ message(STATUS "GLIBCXX_CXX11_ABI: ${IVNR_GLIBCXX_USE_CXX11_ABI}")
 # Staging directory — all shared libs and executables land here so the install
 # step can copy them as a single package directory (same pattern as pysampler).
 ###############################################################################
-set(_ivnr_staging "${CMAKE_BINARY_DIR}/instantvnr")
+set(_ivnr_staging "${CMAKE_BINARY_DIR}/bin")
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${_ivnr_staging}")
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${_ivnr_staging}")
 set(CMAKE_INSTALL_RPATH              "$ORIGIN")
diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt
index 61f214f..b5d1e33 100644
--- a/apps/CMakeLists.txt
+++ b/apps/CMakeLists.txt
@@ -40,15 +40,7 @@ endif()
 add_executable(view_model view_model.cpp)
 target_link_libraries(view_model PRIVATE ${target})
 
-# OVR plugin applications — only built when instantvnr is loaded as an OVR device
-if(DEFINED OVR_BUILD_MODULE_NNVOLUME)
-
-  add_executable(generate_shadow_map shadowmap.cu)
-  target_link_libraries(generate_shadow_map PUBLIC renderlib glfwApp $<TARGET_OBJECTS:device_nnvolume_array>)
-
-  if(ENABLE_IN_SHADER)
-    add_executable(vnr_int_isosurface int_isosurface.cu)
-    target_link_libraries(vnr_int_isosurface PUBLIC renderlib ${target} ${GFX_LIBRARIES})
-  endif()
-
+if(ENABLE_IN_SHADER)
+  add_executable(vnr_int_isosurface int_isosurface.cu)
+  target_link_libraries(vnr_int_isosurface PUBLIC renderlib ${target} ${GFX_LIBRARIES})
 endif()
diff --git a/apps/int_isosurface.cu b/apps/int_isosurface.cu
index c1f0d49..46a6c48 100644
--- a/apps/int_isosurface.cu
+++ b/apps/int_isosurface.cu
@@ -207,7 +207,7 @@ public:
              const float scale,
              int width,
              int height)
-    : GLFCameraWindow(title, camera.from, camera.at, camera.up, scale, width, height)
+    : GLFCameraWindow(title, camera.eye, camera.at, camera.up, scale, width, height)
     , async_rendering_loop(std::bind(&MainWindow::render_background, this))
     , renderer(renderer)
     , volume(volume)
diff --git a/apps/shadowmap.cu b/apps/shadowmap.cu
deleted file mode 100644
index 8b9a7e0..0000000
--- a/apps/shadowmap.cu
+++ /dev/null
@@ -1,686 +0,0 @@
-// TODO this file is incomplete
-
-#include "cmdline.h"
-
-#include "device/device_nnvolume_array.h"
-
-#include <ovr/common/math_def.h>
-#include <ovr/common/random/random.h>
-#include <ovr/common/dylink/Library.h>
-#include <ovr/scene.h>
-#include <ovr/serializer/serializer.h>
-
-#include <cuda/cuda_buffer.h>
-
-#define TFN_MODULE_EXTERNAL_VECTOR_TYPES
-namespace tfn {
-typedef ovr::math::vec2f vec2f;
-typedef ovr::math::vec2i vec2i;
-typedef ovr::math::vec3f vec3f;
-typedef ovr::math::vec3i vec3i;
-typedef ovr::math::vec4f vec4f;
-typedef ovr::math::vec4i vec4i;
-} // namespace tfn
-#include <tfn/core.h>
-
-#include <iostream>
-#include <fstream>
-#include <random>
-
-#define inf float_large
-#define float_large 1e31f
-#define float_small 1e-31f
-#define nearly_one 0.9999f
-
-
-// to avoid crazy formatting and indentations
-#define NAMESPACE_BEGIN namespace ovr { namespace nnvolume {
-#define NAMESPACE_END }}
-
-
-NAMESPACE_BEGIN
-
-using namespace ovr::math;
-using ovr::random::RandomTEA;
-
-struct Light
-{
-    vec3f direction;
-    float intensity;
-};
-
-struct DeviceVolume
-{
-    Array3DScalarCUDA volume;
-    DeviceTransferFunction tfn;
-    float step = 1.f;
-    float step_rcp = 1.f; // GPU cacne to avoid recomputation
-
-    box3f bbox = box3f(vec3f(0), vec3f(1)); // with respect to [0-1]^3
-    affine3f transform;
-
-    int n_lights = 0;
-    Light *lights{ nullptr };
-
-    int ao_samples = 1;
-};
-
-struct HostVolume
-{
-    DeviceVolume device;
-    CUDABuffer device_buffer;
-
-    affine3f matrix;
-    
-    std::vector<vec4f> tfn_colors_data;
-    std::vector<float> tfn_alphas_data;
-    vec2f original_value_range;
-
-    std::vector<Light> lights;
-    CUDABuffer lights_buffer;
-
-public:
-    void load_from_array3d_scalar(array_3d_scalar_t array, float data_value_min = 1, float data_value_max = -1)
-    {
-        Array3DScalarCUDA &output = device.volume;
-        output = CreateArray3DScalarCUDA(array);
-        original_value_range.x = output.lower.v;
-        original_value_range.y = output.upper.v;
-        std::cout << "[Shadow Map] volume range = " << original_value_range.x << " " << original_value_range.y << std::endl;
-        set_value_range(data_value_min, data_value_max);
-    }
-
-    void set_transfer_function(Array1DFloat4CUDA c, Array1DScalarCUDA a, vec2f r)
-    {
-        device.tfn.color = c;
-        device.tfn.opacity = a;
-        set_value_range(r.x, r.y);
-        CUDA_SYNC_CHECK();
-    }
-
-    void set_transfer_function(array_1d_float4_t c, array_1d_scalar_t a, vec2f r)
-    {
-        set_transfer_function(CreateArray1DFloat4CUDA(c), CreateArray1DScalarCUDA(a), r);
-    }
-
-    void set_transfer_function(const std::vector<float> &c, const std::vector<float> &o, const vec2f &r)
-    {
-        tfn_colors_data.resize(c.size() / 3);
-        for (int i = 0; i < tfn_colors_data.size(); ++i) {
-            tfn_colors_data[i].x = c[3 * i + 0];
-            tfn_colors_data[i].y = c[3 * i + 1];
-            tfn_colors_data[i].z = c[3 * i + 2];
-            tfn_colors_data[i].w = 1.f;
-        }
-
-        tfn_alphas_data.resize(o.size() / 2);
-        for (int i = 0; i < tfn_alphas_data.size(); ++i) {
-            tfn_alphas_data[i] = o[2 * i + 1];
-        }
-
-        if (!tfn_colors_data.empty() && !tfn_alphas_data.empty()) {
-            set_transfer_function(CreateArray1DFloat4CUDA(tfn_colors_data), CreateArray1DScalarCUDA(tfn_alphas_data), r);
-        }
-        CUDA_SYNC_CHECK();
-    }
-
-    void set_value_range(float data_value_min, float data_value_max)
-    {
-        Array3DScalarCUDA &volume = device.volume;
-        if (data_value_max >= data_value_min)
-        {
-            float normalized_max = integer_normalize(data_value_max, volume.type);
-            float normalized_min = integer_normalize(data_value_min, volume.type);
-            volume.upper.v = normalized_max; // should use the transfer function value range here
-            volume.lower.v = normalized_min;
-        }
-        volume.scale.v = 1.f / (volume.upper.v - volume.lower.v);
-        // Need calculation on max opacity
-        auto r_x = max(original_value_range.x, volume.lower.v);
-        auto r_y = min(original_value_range.y, volume.upper.v);
-        device.tfn.value_range.y = r_y;
-        device.tfn.value_range.x = r_x;
-        device.tfn.range_rcp_norm = 1.f / (device.tfn.value_range.y - device.tfn.value_range.x);
-    }
-
-    void load_lights(const Scene &scene)
-    {
-        lights.clear();
-        for (auto& li : scene.lights) {
-            if (li.type == scene::Light::DIRECTIONAL) {
-                lights.emplace_back(Light{
-                    /*.direction =*/ li.directional.direction,
-                    /*.intensity =*/ li.intensity
-                });
-            }
-        }
-
-        lights_buffer.alloc_and_upload(lights);
-
-        device.n_lights = (int)lights.size();
-        device.lights  = (Light *)lights_buffer.d_pointer();
-    }
-
-};
-
-void commit(const Scene &scene, HostVolume &volume)
-{
-    auto& sv = ovr::parse_single_volume_scene(scene, scene::Volume::STRUCTURED_REGULAR_VOLUME).structured_regular;
-    auto& st = scene.instances[0].models[0].volume_model.transfer_function;
-
-    vec3f scale     = sv.grid_spacing * vec3f(sv.data->dims);
-    vec3f translate = sv.grid_origin;
-
-    volume.matrix = affine3f::translate(translate) * affine3f::scale(scale);
-    volume.device.transform = volume.matrix;
-
-    volume.load_from_array3d_scalar(sv.data);
-    volume.set_transfer_function(CreateArray1DFloat4CUDA(st.color), CreateArray1DScalarCUDA(st.opacity), st.value_range);
-
-    volume.device.step = 1.f / scene.volume_sampling_rate;
-    volume.device.step_rcp = scene.volume_sampling_rate;
-
-    volume.device.ao_samples = scene.ao_samples;
-
-    volume.load_lights(scene);
-
-    // call this in the end
-    volume.device_buffer.resize(sizeof(volume.device));
-    volume.device_buffer.upload(&volume.device, 1);
-}
-
-static __device__ bool
-intersect_box(float &_t0, float &_t1, const vec3f ray_ori, const vec3f ray_dir, vec3f &box_lower, vec3f &box_upper)
-{
-    const vec3f &lower = box_lower;
-    const vec3f &upper = box_upper;
-
-    float t0 = _t0;
-    float t1 = _t1;
-#if 1
-    const vec3i is_small =
-        vec3i(fabs(ray_dir.x) < float_small, fabs(ray_dir.y) < float_small, fabs(ray_dir.z) < float_small);
-    const vec3f rcp_dir = /* ray direction reciprocal*/ 1.f / ray_dir;
-    const vec3f t_lo = vec3f(is_small.x ? float_large : (lower.x - ray_ori.x) * rcp_dir.x, //
-                             is_small.y ? float_large : (lower.y - ray_ori.y) * rcp_dir.y, //
-                             is_small.z ? float_large : (lower.z - ray_ori.z) * rcp_dir.z  //
-    );
-    const vec3f t_hi = vec3f(is_small.x ? -float_large : (upper.x - ray_ori.x) * rcp_dir.x, //
-                             is_small.y ? -float_large : (upper.y - ray_ori.y) * rcp_dir.y, //
-                             is_small.z ? -float_large : (upper.z - ray_ori.z) * rcp_dir.z  //
-    );
-    t0 = max(t0, reduce_max(min(t_lo, t_hi)));
-    t1 = min(t1, reduce_min(max(t_lo, t_hi)));
-#else
-    const vec3f t_lo = (lower - ray_ori) / ray_dir;
-    const vec3f t_hi = (upper - ray_ori) / ray_dir;
-    t0 = max(t0, reduce_max(min(t_lo, t_hi)));
-    t1 = min(t1, reduce_min(max(t_lo, t_hi)));
-#endif
-    _t0 = t0;
-    _t1 = t1;
-    return t1 > t0;
-}
-
-template <typename T>
-__forceinline__ __device__ T lerp(float r, const T &a, const T &b)
-{
-    return (1 - r) * a + r * b;
-}
-
-template <typename T, int N>
-static __device__ T
-array1d_nodal(const ArrayCUDA<1, N> &array, float v)
-{
-    float t = (0.5f + v * (array.dims.v - 1)) / array.dims.v;
-    return tex1D<T>(array.data, t);
-}
-
-static __device__ float
-sample_volume(const Array3DScalarCUDA &self, vec3f p)
-{
-    /* sample volume in object space [0, 1] */
-    p.x = clamp(p.x, 0.f, 1.f);
-    p.y = clamp(p.y, 0.f, 1.f);
-    return tex3D<float>(self.data, p.x, p.y, p.z);
-}
-
-static __device__ void
-sample_transfer_function(const DeviceTransferFunction &tfn, float sampleValue, vec3f &_sampleColor, float &_sampleAlpha)
-{
-    const auto v = (clamp(sampleValue, tfn.value_range.x, tfn.value_range.y) - tfn.value_range.x) * tfn.range_rcp_norm;
-    vec4f rgba = array1d_nodal<float4>(tfn.color, v);
-    rgba.w = array1d_nodal<float>(tfn.opacity, v); // followed by the alpha correction
-    _sampleColor = vec3f(rgba);
-    _sampleAlpha = rgba.w;
-}
-
-static __device__ void
-opacity_correction(const DeviceVolume &self, const float &distance, float &opacity)
-{
-    opacity = 1.f - __powf(1.f - opacity, 2.f * self.step_rcp * distance);
-}
-
-template <typename F>
-__device__ void
-ray_marching_iterator(const float tMin, const float tMax,
-                      const float step, const F &body,
-                      bool debug = false)
-{
-    vec2f t = vec2f(tMin, min(tMax, tMin + step));
-    while ((t.y > t.x) && body(t))
-    {
-        t.x = t.y;
-        t.y = min(t.x + step, tMax);
-    }
-}
-
-__device__ float
-ray_marching_transmittance(const DeviceVolume &self,
-                           const vec3f org,
-                           const vec3f dir,
-                           RandomTEA &rng)
-{
-    const auto marching_step = self.step;
-
-    float alpha = 0.f;
-    float t0 = 0.f, t1 = inf;
-
-    vec3f lower_end = vec3f(0.f);
-    vec3f upper_end = vec3f(1.f);
-
-    if (!intersect_box(t0, t1, org, dir, lower_end, upper_end)) return 1.f;
-
-    // jitter ray to remove ringing effects
-    const float jitter = rng.get_floats().x;
-
-    // start marching
-    ray_marching_iterator(t0, t1, marching_step, [&](const vec2f &t) {
-        // sample data value
-        const auto p = org + lerp(jitter, t.x, t.y) * dir; // object space position
-        const auto sampleValue = sample_volume(self.volume, p);
-        // classification
-        vec3f sampleColor;
-        float sampleAlpha;
-        sample_transfer_function(self.tfn, sampleValue, sampleColor, sampleAlpha);
-        opacity_correction(self, t.y - t.x, sampleAlpha);
-        // blending
-        alpha += (1.f - alpha) * sampleAlpha;
-        return alpha < nearly_one; 
-    });
-
-    return 1.f - alpha;
-}
-
-__global__ void
-ray_marching_kernel(const vec3i dims, const void *ptr, float *__restrict__ shadowbuffer)
-{
-    // 3D kernel launch
-    vec3i  voxel_coord = vec3i(threadIdx.x + blockIdx.x * blockDim.x, threadIdx.y + blockIdx.y * blockDim.y, threadIdx.z + blockIdx.z * blockDim.z);
-    size_t voxel_index = voxel_coord.x + voxel_coord.y * (size_t)dims.x + voxel_coord.z * (size_t)dims.y * (size_t)dims.x;
-
-    if (voxel_index > dims.long_product()) return;
-
-    // generate ray & payload
-    RandomTEA rng(voxel_index, 0);
-
-    // voxel center in local coordinate (0-1)^3 get the object to world transformation
-    const DeviceVolume &self = *((DeviceVolume *)ptr);
-
-    const affine3f otw = self.transform;
-    const affine3f wto = otw.inverse();
-    
-    const vec3f org = (vec3f(voxel_coord) + vec3f(0.5f, 0.5f, 0.5f)) / vec3f(dims); // transform to object space
-
-    float shadow = 0.f;
-    for (int i = 0; i < self.n_lights; ++i)
-    {
-        auto li = self.lights[i]; // copy to register, intentional
-
-        vec3f li_dir = normalize(li.direction);
-        float li_val = li.intensity;
-
-        float li_shadow = 0.f;
-        for (int spv = 0; spv < self.ao_samples; ++spv)
-        {
-            li_shadow += ray_marching_transmittance(self, org, xfmVector(wto, li_dir), rng); // transform to object space
-        }
-
-        shadow += (li_shadow / self.ao_samples) * li_val;
-    }
-
-    shadowbuffer[voxel_index] = shadow;
-}
-
-NAMESPACE_END
-
-using namespace ovr;
-using namespace ovr::nnvolume;
-
-struct CmdArgs : CmdArgsBase {
-public:
-    args::ArgumentParser parser;
-    args::HelpFlag help;
-
-    args::Positional<std::string> m_scene;
-    std::string scene() { return args::get(m_scene); }
-
-    // optional
-
-    args::ValueFlag<float> m_sampling_rate;
-    float sampling_rate() { return (m_sampling_rate) ? args::get(m_sampling_rate) : 1.f; }
-
-    args::ValueFlag<int> m_shadow_samples;
-    int shadow_samples() { return (m_shadow_samples) ? args::get(m_shadow_samples) : 1; }
-
-    args::ValueFlag<std::string> m_output;
-    std::string output() { return (m_output) ? args::get(m_output) : "shadowmap"; }
-
-    // group for random lights
-
-    args::Group group_random_lights;
-
-    args::ValueFlag<int> m_random_lights;
-    bool random_lights() { return (m_random_lights); }
-    int num_random_lights() { return (m_random_lights) ?  args::get(m_random_lights) : 0; }
-
-    args::Flag m_random_tfn;
-    bool random_tfn() { return (m_random_tfn); }
-
-    // Ring light parsing
-
-    args::Group group_ring_lights;
-
-    args::ValueFlag<int> m_ring_lights;
-    bool ring_lights() { return (m_ring_lights); }
-    int num_ring_lights() { return (m_ring_lights) ?  args::get(m_ring_lights) : 0; }
-
-    args::ValueFlag<float> m_theta;
-    float value_theta() { return (m_theta) ? args::get(m_theta) : 0.f; }
-
-    args::ValueFlag<float> m_phi;
-    float value_phi() { return (m_phi) ? args::get(m_phi) : 0.f; }
-
-public:
-    CmdArgs(const char *title, int argc, char **argv)
-        : parser(title)
-        , help(parser, "help", "display the help menu", {'h', "help"})
-        , m_scene(parser, "string", "the scene to render")
-        , m_sampling_rate(parser, "float", "ray marching sampling rate", {"sampling-rate"})
-        , m_shadow_samples(parser, "int", "number of samples per voxel", {"shadow-samples"})
-        , m_output(parser, "string", "output name", {"output"})
-        , group_random_lights(parser, "random light group:", args::Group::Validators::AllOrNone)
-        , m_random_lights(group_random_lights, "int", "generate N random lights", {"random-lights"})
-        , m_random_tfn(parser, "flag", "generate a random transfer function", {"random-tfn"})
-        , group_ring_lights(parser, "ring light group:", args::Group::Validators::AllOrNone)
-        , m_ring_lights(group_ring_lights, "int", "Generate N lights in a ring", {"ring-lights"})
-        , m_theta(parser, "int", "ring theta", {"theta"})
-        , m_phi(parser, "int", "ring phi", {"phi"})
-    {
-        exec(parser, argc, argv);
-    }
-};
-
-int main(int ac, char **av)
-{
-    CmdArgs args("Shadow Volume Generator", ac, av);
-
-    // Create scene + volume + tfn + lights
-    Scene scene = scene::create_scene(args.scene());
-    scene.volume_sampling_rate = args.sampling_rate();
-    scene.ao_samples = args.shadow_samples();
-
-    auto &scene_tfn = scene.instances[0].models[0].volume_model.transfer_function;
-
-    // generate random lights
-    srand((unsigned int)time(NULL)); // Initialization, should only be called once.
-
-    if (args.random_lights()) {
-
-        scene.lights.clear();
-
-        for (int i = 0; i < args.num_random_lights(); ++i) {
-            std::cout << "Generate Light #" << i << std::endl;
-
-            scene::Light light;
-            light.type = scene::Light::DIRECTIONAL;
-
-            float theta = 2.0f * (float)M_PI * ((float)rand() / (float)(RAND_MAX));
-            float phi   = 1.0f * (float)M_PI * ((float)rand() / (float)(RAND_MAX));
-
-            // Generate Direction
-            float x = 1.0f * cos(phi) * sin(theta);
-            float y = 1.0f * sin(phi) * sin(theta);
-            float z = 1.0f * cos(theta);
-            light.directional.direction = normalize(vec3f(x, y, z));
-            std::cout << "Light Direction: " << light.directional.direction.x << " " << light.directional.direction.y << " " << light.directional.direction.z << " " << std::endl;
-
-            // Generate Color
-            light.intensity = 1.f / args.num_random_lights();
-
-            // Store
-            scene.lights.push_back(light);
-        }
-
-    }
-
-    // Ring light generation
-    else if (args.ring_lights()){
-        std::cout << "Num Lights: " << args.num_ring_lights() << std::endl;
-        std::cout << "Theta: " << args.value_theta() << std::endl;
-        std::cout << "Phi: " << args.value_phi() << std::endl << std::endl;
-
-        scene.lights.clear();
-
-        for (int i = 0; i < args.num_ring_lights(); ++i) {
-            std::cout << "Generate Light #" << i << std::endl;
-
-            scene::Light light;
-            light.type = scene::Light::DIRECTIONAL;
-
-            // Theta and phi are passed in as deg
-            float theta = (float)M_PI/180.f * args.value_theta();
-
-            float phi_offset = 360.f / args.num_ring_lights();
-            float phi   = (float)M_PI/180.f * (args.value_phi() + i * phi_offset);
-
-            // Generate Direction
-            float x = 1.0f * cos(phi) * sin(theta);
-            float y = 1.0f * sin(phi) * sin(theta);
-            float z = 1.0f * cos(theta);
-            light.directional.direction = normalize(vec3f(x, y, z));
-            std::cout << "Light Direction: " << light.directional.direction.x << " " << light.directional.direction.y << " " << light.directional.direction.z << " " << std::endl;
-
-            // Generate Color
-            light.intensity = 1.f / args.num_ring_lights();
-
-            // Store
-            scene.lights.push_back(light);
-        }
-
-    }
-
-
-    // create a transfer function object
-    tfn::TransferFunctionCore tfn(1024);
-    range1f range;
-    {
-        vec4f* color_data = scene_tfn.color->data_typed<vec4f>();
-        float* alpha_data = scene_tfn.opacity->data_typed<float>();
-        for (int i = 0; i < scene_tfn.color->size(); ++i) {
-            auto color = color_data[i];
-            float pos = (float)i / (scene_tfn.color->size() - 1);
-            tfn.addColorControl(tfn::TransferFunctionCore::ColorControl(pos, color.xyz()));
-        }
-        for (int i = 0; i < scene_tfn.color->size(); ++i) {
-            auto alpha = alpha_data[i];
-            float pos = (float)i / (scene_tfn.color->size() - 1);
-            tfn.addAlphaControl(vec2f(pos, alpha));
-        }
-
-        range.lower = scene_tfn.value_range.x;
-        range.upper = scene_tfn.value_range.y;
-    }
-
-    if (args.random_tfn()) {
-        tfn.clearAlphaControls();
-
-        // Create Number of Gaussian, 1 to 10
-        const int num_gaussian = (int)(((float)rand() / (float)(RAND_MAX) + 1) * 5);
-        std::cout << "Generate Gaussian #" << num_gaussian << std::endl;
-
-        for (int each_gaussian = 0; each_gaussian < num_gaussian; ++each_gaussian)
-        {
-            float gaussian_mean   = (float)rand() / (float)(RAND_MAX);
-            float gaussian_sigma  = max(0.2f * (float)rand() / (float)(RAND_MAX), 0.0001f);
-            float gaussian_height = max((gaussian_sigma * std::sqrt(2.0f * float(M_PI))) * (float)rand() / (float)(RAND_MAX), 0.0001f);
-
-            std::cout << "Generate Gaussian Mean: " << gaussian_mean << std::endl;
-            std::cout << "Generate Gaussian Height: " << gaussian_height << std::endl;
-            std::cout << "Generate Gaussian Sigma: " << gaussian_sigma << std::endl;
-
-            tfn.addGaussianObject(gaussian_mean, gaussian_sigma, gaussian_height);
-        }
-
-        // It seems we do not need to explicitly normalize gaussians
-    }
-
-    // overwrite the scene tfn
-    tfn.updateColorMap();
-    {
-        auto* table = (vec4f*)tfn.data();
-        std::vector<vec4f> color(tfn.resolution());
-        std::vector<float> alpha(tfn.resolution());
-        for (int i = 0; i < tfn.resolution(); ++i) {
-            const auto rgba = table[i];
-            color[i] = vec4f(rgba.xyz(), 1.f);
-            alpha[i] = rgba.w;
-        }
-        scene_tfn.color   = CreateArray1DFloat4(color);
-        scene_tfn.opacity = CreateArray1DScalar(alpha);
-    }
-
-    // set volume
-    HostVolume params;
-    commit(scene, params);
-
-    // process
-    vec3i  shadowmap_dims = vec3i(256); // vec3i(scene_vol.data->dims);
-    size_t shadowmap_size = shadowmap_dims.long_product();
-    CUDABuffer         shadowmap_gpu;
-    std::vector<float> shadowmap_cpu;
-    range1f shadowmap_range;
-
-    shadowmap_cpu.resize(shadowmap_size);
-    shadowmap_gpu.alloc(shadowmap_size * sizeof(float));
-
-    CUDA_SYNC_CHECK();
-
-    // call kernel to compute shadow volume
-    const int n_threads = 8;
-    const dim3 block_size(n_threads, n_threads, n_threads);
-    const dim3 grid_size(
-        misc::div_round_up(shadowmap_dims.x, n_threads),
-        misc::div_round_up(shadowmap_dims.y, n_threads),
-        misc::div_round_up(shadowmap_dims.z, n_threads)
-    );
-    ray_marching_kernel<<<grid_size, block_size>>>(shadowmap_dims, (void *)params.device_buffer.d_pointer(), (float *)shadowmap_gpu.d_pointer());
-
-    CUDA_SYNC_CHECK();
-
-    // shadowmap_gpu.download_async(shadowmap_cpu.data(), shadowmap_cpu.size());
-    cudaMemcpy(shadowmap_cpu.data(), (float*)shadowmap_gpu.d_pointer(), shadowmap_size * sizeof(float), cudaMemcpyDeviceToHost);
-
-    CUDA_SYNC_CHECK();
-
-    for (int i = 0; i < shadowmap_size; i++) {
-        shadowmap_range.extend(shadowmap_cpu[i]);
-    }
-    std::cout << "shadowmap range: " << shadowmap_range.lower << " " << shadowmap_range.upper << std::endl;
-
-    // save shadow volume to a binary file
-    std::ofstream outS(args.output() + ".bin", std::ios::out | std::ios::binary);
-    outS.write((char *)shadowmap_cpu.data(), shadowmap_cpu.size() * sizeof(float)); // <- This is where the code breaks
-    outS.close();
-
-    // save the scene file
-    json root;
-
-    // volume data
-    {
-        json data;
-        data["dimensions"] = { { "x", shadowmap_dims.x }, { "y", shadowmap_dims.y }, { "z", shadowmap_dims.z } };
-        data["endian"] = "LITTLE_ENDIAN";
-        data["fileName"] = args.output() + ".bin";
-        data["fileUpperLeft"] = false;
-        data["format"] = "REGULAR_GRID_RAW_BINARY";
-        data["id"] = 1;
-        data["name"] = "shadowmap";
-        data["offset"] = 0;
-        data["type"] = "FLOAT";
-        root["dataSource"].push_back(data);
-    }
-
-    root["original"] = args.scene();
-
-    // view
-    {
-        json& view = root["view"];
-
-        json& camera = view["camera"];
-        camera["center"] = { { "x", shadowmap_dims.x / 2.f }, { "y", shadowmap_dims.y / 2.f }, { "z", shadowmap_dims.z / 2.f } };
-        camera["eye"]    = { { "x", shadowmap_dims.x / 2.f }, { "y", shadowmap_dims.y / 2.f }, { "z", shadowmap_dims.z / 2.f - shadowmap_dims.z } };
-        camera["up"]     = { { "x", 0.f }, { "y", 1.f }, { "z", 0.f } };
-        camera["fovy"] = 60;
-        camera["projectionMode"] = "PERSPECTIVE";
-        camera["zFar"] = 2000;
-        camera["zNear"] = 1;
-
-        for (auto& li : params.lights) {
-            json light;
-            light["ambient"]  = { { "a", 1.f }, { "b", 1.f }, { "g", 1.f }, { "r", 1.f } };
-            light["specular"] = { { "a", 1.f }, { "b", 1.f }, { "g", 1.f }, { "r", 1.f } };
-            light["diffuse"]  = { { "a", 1.f }, { "r", li.intensity   }, { "g", li.intensity   }, { "b", li.intensity   } };
-            light["position"] = { { "w", 0.f }, { "x", li.direction.x }, { "y", li.direction.y }, { "z", li.direction.z } };
-            light["type"] = "DIRECTIONAL_LIGHT";
-            if (!view.contains("lightSource")) {
-                view["lightSource"] = light;
-            }
-            else {
-                view["additionalLightSources"].push_back(light);
-            }
-        }
-
-        view["lighting"] = true;
-        view["lightingSide"] = "FRONT_SIDE";
-        view["tfPreIntegration"] = false;
-
-        auto& vol = view["volume"];
-        vol["dataId"] = 1;
-        vol["interpolationType"] = "LINEAR_INTERPOLATION";
-        vol["opacityUnitDistance"] = 1;
-        vol["sampleDistance"] = params.device.step;
-        vol["scalarMappingRange"] = {
-            { "maximum", 1.0 }, { "minimum", 0.0 } // we should not normalize a shadow map
-        };
-        vol["transferFunctionType"] =  "TRANSFER_FUNCTION";
-        vol["visible"] = true;
-
-        // transfer function
-        tfn::saveTransferFunction(tfn, vol["transferFunction"]);
-    }
-
-    // save as text file
-    std::ofstream outJ(args.output() + ".json", std::ios::out);
-    outJ << std::setw(4) << root << std::endl;
-    outJ.close();
-
-    std::cout << "Ended" << std::endl;
-    return 0;
-}
-
-// command to train a neural network
-// ../../instant-vnr-cuda/run.sh ../../instant-vnr-cuda/build/Release/vnr_cmd_train --volume ./shadowmap.json   --max-num-steps 10000 --mode GPU --network network.json 
-
-// command to run
-// bash ../scripts/run.sh ../build/Debug/renderapp configs/scene_mechhand.json nnvolume
diff --git a/apps/shadowmap.usda b/apps/shadowmap.usda
deleted file mode 100644
index b2db23d..0000000
--- a/apps/shadowmap.usda
+++ /dev/null
@@ -1,48 +0,0 @@
-#usda 1.0
-
-def "scene" {
-    def "volume" {
-        string data_path = "configs/scene_chameleon.json"
-        # string data_path = "configs/scene_mechhand.json"
-    }
-    def "camera" {
-        # from = eye
-        float3 from = (
-            -1818.01,
-            2496.51,
-            -1875.73
-        )
-        # at = center
-        float3 at = (
-            790.71,
-            1057.9441,
-            926.1431
-        )
-        # up = up
-        float3 up = (
-            0.608985,
-            -0.313457,
-            -0.727943
-        )
-    }
-    def "light" {
-        def "ambient" {
-            def "first_light" {
-                float  intensity = 1
-                float3 color     = (1, 1, 1)
-            }
-        }
-        def "directional" {
-            def "first_light" {
-                float  intensity = 2
-                float3 direction = (0, -10, 0)
-                float3 color     = (1, 1, 1)
-            }
-            def "second_light" {
-                float  intensity = 2
-                float3 direction = (0, 10, 0)
-                float3 color     = (1, 1, 1)
-            }
-        }
-    }
-}
diff --git a/device/CMakeLists.txt b/device/CMakeLists.txt
index f527170..9b2cd28 100644
--- a/device/CMakeLists.txt
+++ b/device/CMakeLists.txt
@@ -32,7 +32,6 @@ add_library(device_nnvolume ${_dev_type}
   $<TARGET_OBJECTS:device_nnvolume_array>
   device.cpp
   device_impl.cpp
-  # method_shadowmap.cu  # uncomment to include shadow-map GPU code
 )
 set_target_properties(device_nnvolume
 PROPERTIES
diff --git a/device/method_shadowmap.cu b/device/method_shadowmap.cu
deleted file mode 100644
index 53900b1..0000000
--- a/device/method_shadowmap.cu
+++ /dev/null
@@ -1,759 +0,0 @@
-//. ======================================================================== //
-//.                                                                          //
-//. Copyright 2019-2022 Qi Wu                                                //
-//.                                                                          //
-//. Licensed under the MIT License                                           //
-//.                                                                          //
-//. ======================================================================== //
-
-#include "method_shadowmap.h"
-#include "raytracing.h"
-#include "dda.h"
-
-#include <cuda/cuda_buffer.h>
-
-#ifndef ADAPTIVE_SAMPLING
-#error "ADAPTIVE_SAMPLING is not defined"
-#endif
-
-
-INSTANT_VNR_NAMESPACE_BEGIN
-
-constexpr auto N_ITERS = 16;
-
-using ShadingMode = MethodShadowMap::ShadingMode;
-constexpr auto NO_SHADING = MethodShadowMap::NO_SHADING;
-constexpr auto SHADING    = MethodShadowMap::SHADING;
-
-// using vnr::SciVisMaterial;
-
-// ------------------------------------------------------------------
-//
-// ------------------------------------------------------------------
-
-// NOTE: what is the best SoA layout here?
-
-struct ShadowMapData : LaunchParams
-{
-  ShadowMapData(const LaunchParams& p) : LaunchParams(p) {}
-
-  ShadingMode mode;
-
-  SciVisMaterial material{ 2.f, 1.5f, .4f, 40.f };
-
-  DeviceVolume* __restrict__ volume{ nullptr };
-
-  // belows are only useful for sampling streaming
-  uint32_t* __restrict__ counter{ nullptr };
-
-  vec3f* __restrict__ inference_input { nullptr };
-  float* __restrict__ inference_output{ nullptr };
-
-  // per ray payload (ordered by ray index)
-  uint32_t* __restrict__ pixel_index{ nullptr };
-  float* __restrict__ jitter{ nullptr };
-  float* __restrict__ alpha{ nullptr };
-  vec3f* __restrict__ color_or_org{ nullptr };
-#if ADAPTIVE_SAMPLING
-  vec3f* __restrict__ iter_t_next{ nullptr };
-  vec3i* __restrict__ iter_cell{ nullptr };
-#endif
-  float* __restrict__ iter_next_cell_begin{ nullptr };
-
-  // belows are only used by SSH
-  vec3f* __restrict__ inter_highest_org  { nullptr }; // ordered by ray index
-  float* __restrict__ inter_highest_alpha{ nullptr };
-  vec3f* __restrict__ inter_highest_color{ nullptr };
-  vec3f* __restrict__ final_highest_org  { nullptr }; // ordered by pixel index
-  float* __restrict__ final_highest_alpha{ nullptr };
-  vec3f* __restrict__ final_highest_color{ nullptr };
-  vec4f* __restrict__ shading_color{ nullptr }; // ordered by pixel index
-  float* __restrict__ jitter_ssh{ nullptr };
-};
-
-/* standard version */ void
-do_raymarching_trivial(cudaStream_t stream, const ShadowMapData& params);
-
-/* iterative version */ void
-do_raymarching_iterative(cudaStream_t stream, const ShadowMapData& params, NeuralVolume* network, uint32_t numPixels);
-
-// ------------------------------------------------------------------
-//
-// ------------------------------------------------------------------
-
-namespace {
-
-// ------------------------------------------------------------------
-
-
-// ------------------------------------------------------------------
-
-inline __device__ uint32_t 
-new_ray_index(const ShadowMapData& params)
-{
-  return atomicAdd(params.counter, 1);
-}
-
-template<typename T>
-inline T* define_buffer(char* begin, size_t& offset, size_t buffer_size)
-{
-  auto* ret = (T*)(begin + offset); 
-  offset += buffer_size * sizeof(T);
-  return ret;
-}
-
-// -------
-//
-// -------
-
-struct Ray 
-{
-  vec3f org{};
-  vec3f dir{};
-  float alpha = 0.f;
-  vec3f color = 0.f; // not used by shadow rays
-};
-
-// -------
-//
-// -------
-
-struct RayMarchingIter 
-#if ADAPTIVE_SAMPLING
-  : private dda::DDAIter
-#endif
-{
-#if ADAPTIVE_SAMPLING
-  using DDAIter::cell;
-  using DDAIter::t_next;
-  using DDAIter::next_cell_begin;
-#else
-  float next_cell_begin{};
-#endif
-
-  __device__ RayMarchingIter() {}
-  __device__ RayMarchingIter(const DeviceVolume& self, const vec3f& org, const vec3f& dir, const float tMin, const float tMax);
-  bool __device__ resumable(const DeviceVolume& self, vec3f dir, float t_min, float t_max);
-
-  template<typename F>
-  __device__ void exec(const DeviceVolume& self, const vec3f& org, const vec3f& dir, const float tMin, const float tMax, const float step, const uint32_t pidx, const F& body);
-};
-
-__device__
-RayMarchingIter::RayMarchingIter(const DeviceVolume& self, const vec3f& org, const vec3f& dir, const float tMin, const float tMax)
-{
-#if ADAPTIVE_SAMPLING
-  const auto& dims = self.macrocell_dims;
-  const vec3f m_org = org * self.macrocell_spacings_rcp;
-  const vec3f m_dir = dir * self.macrocell_spacings_rcp;
-  DDAIter::init(m_org, m_dir, tMin, tMax, dims);
-#endif
-}
-
-template<typename F>
-__device__ void
-RayMarchingIter::exec(const DeviceVolume& self, const vec3f& org, const vec3f& dir, const float tMin, const float tMax, const float step, const uint32_t pidx, const F& body)
-{
-#if ADAPTIVE_SAMPLING
-
-  const auto& dims = self.macrocell_dims;
-  const vec3f m_org = org * self.macrocell_spacings_rcp;
-  const vec3f m_dir = dir * self.macrocell_spacings_rcp;
-
-  const auto lambda = [&](const vec3i& cell, float t0, float t1) {
-    // calculate max opacity
-    float r = opacityUpperBound(self, cell);
-    if (fabsf(r) <= float_epsilon) return true; // the cell is empty
-    // estimate a step size
-    const auto ss = adaptiveSamplingRate(step, r);
-    // iterate within the interval
-    vec2f t = vec2f(t0, min(t1, t0 + ss));
-    while (t.y > t.x) {
-      DDAIter::next_cell_begin = t.y - tMin;
-      if (!body(t)) return false;
-      t.x = t.y;
-      t.y = min(t.x + ss, t1);
-    }
-    return true;
-  };
-
-  while (DDAIter::next(m_org, m_dir, tMin, tMax, dims, false, lambda)) {}
-
-#else
-
-  vec2f t;
-  t.x = max(tMin + next_cell_begin, tMin);
-  t.y = min(t.x + step, tMax);
-  while (t.y > t.x) {
-    next_cell_begin = t.y - tMin;
-    if (!body(t)) return;
-    t.x = t.y;
-    t.y = min(t.x + step, tMax);
-  }
-
-  next_cell_begin = float_large;
-  return;
-
-#endif
-}
-
-bool __device__
-RayMarchingIter::resumable(const DeviceVolume& self, vec3f dir, float tMin, float tMax)
-{
-#if ADAPTIVE_SAMPLING
-  const auto& dims = self.macrocell_dims;
-  const vec3f m_dir = dir * self.macrocell_spacings_rcp;
-  return DDAIter::resumable(m_dir, tMin, tMax, dims);
-#else
-  return tMin + next_cell_begin < tMax;
-#endif
-}
-
-
-// -------
-//
-// -------
-
-struct SampleStreamingPayload
-{
-public:
-  uint32_t pixel_index = 0;
-  float jitter = 0.f;
-  RayMarchingIter iter;
-
-private:
-  union {
-    vec3f color;
-    vec3f org;
-  };
-  float alpha = 0.f;
-
-public:
-  __device__ SampleStreamingPayload(const uint32_t pixel_index, const float jitter) : pixel_index(pixel_index), jitter(jitter), color(0) {}
-  __device__ SampleStreamingPayload(const ShadowMapData& params, const uint32_t ray_index); // load a payload from memory
-  __device__ void save(const ShadowMapData& params, uint32_t ridx) const;
-  // __device__ void as_camera_ray(const vec3f& c, const float& a) { color = c, alpha = a; }
-  // __device__ void as_shadow_ray(const vec3f& o) { org = o; }
-  __device__ void set_ray(const Ray& ray) { alpha = ray.alpha, color = ray.color; }
-  __device__ Ray compute_ray(const ShadowMapData& params) const;
-};
-
-__device__
-SampleStreamingPayload::SampleStreamingPayload(const ShadowMapData& params, const uint32_t ray_index) 
-{
-  pixel_index = params.pixel_index[ray_index];
-  jitter = params.jitter[ray_index];
-  alpha = params.alpha[ray_index];
-  color = params.color_or_org[ray_index];
-#if ADAPTIVE_SAMPLING
-  iter.cell = params.iter_cell[ray_index];
-  iter.t_next = params.iter_t_next[ray_index];
-#endif
-  iter.next_cell_begin = params.iter_next_cell_begin[ray_index];
-}
-
-__device__ void
-SampleStreamingPayload::save(const ShadowMapData& params, uint32_t ridx) const
-{
-  params.pixel_index[ridx] = pixel_index;
-  params.jitter[ridx] = jitter;
-  params.alpha[ridx] = alpha;
-  params.color_or_org[ridx] = color;
-#if ADAPTIVE_SAMPLING
-  params.iter_cell[ridx] = iter.cell;
-  params.iter_t_next[ridx] = iter.t_next;
-#endif
-  params.iter_next_cell_begin[ridx] = iter.next_cell_begin;
-}
-
-__device__ Ray 
-SampleStreamingPayload::compute_ray(const ShadowMapData& params) const
-{
-  const auto& fbIndex = pixel_index;
-
-  // compute pixel ID
-  const uint32_t ix = fbIndex % params.frame.size.x;
-  const uint32_t iy = fbIndex / params.frame.size.x;
-
-  // normalized screen plane position, in [0,1]^2
-  const auto& camera = params.camera;
-  const vec2f screen(vec2f((float)ix + .5f, (float)iy + .5f) / vec2f(params.frame.size));
-
-  // get the object to world transformation
-  const affine3f& otw = params.transform;
-  const affine3f wto = otw.inverse();
-
-  // generate ray direction
-  Ray ray;
-  ray.org = xfmPoint(wto, camera.position);
-  ray.dir = xfmVector(wto, normalize(/* -z axis */ camera.direction +
-                                     /* x shift */ (screen.x - 0.5f) * camera.horizontal +
-                                     /* y shift */ (screen.y - 0.5f) * camera.vertical));
-  ray.alpha = alpha;
-  ray.color = color;
-  return ray;
-}
-
-// ------------------------------------------------------------------
-
-}
-
-// ------------------------------------------------------------------
-//
-// ------------------------------------------------------------------
-
-void
-MethodShadowMap::render(cudaStream_t stream, const LaunchParams& _params, ShadingMode mode, DeviceVolume* volume, NeuralVolume* network, bool iterative)
-{
-  ShadowMapData params = _params;
-
-  const uint32_t numPixels = (uint32_t)params.frame.size.long_product();
-
-  params.volume = volume;
-  params.mode = mode;
-
-  if (iterative) {
-    const uint32_t nSamplesPerCoord = N_ITERS;
-
-    size_t nBytes = numPixels * nSamplesPerCoord * sizeof(vec4f); // inference input + output
-    nBytes += numPixels * sizeof(SampleStreamingPayload); // ray payloads
-    nBytes += numPixels * sizeof(RayMarchingIter); // iterators
-    nBytes += sizeof(uint32_t); // counter
-
-    sample_streaming_buffer.resize(nBytes, stream);
-    CUDA_CHECK(cudaMemsetAsync((void*)sample_streaming_buffer.d_pointer(), 0, nBytes, stream)); // initialize all buffers
-
-    char* begin = (char*)sample_streaming_buffer.d_pointer();
-    size_t offset = 0;
-
-    // allocate staging data
-    params.inference_input  = define_buffer<vec3f>(begin, offset, numPixels * nSamplesPerCoord);
-    params.inference_output = define_buffer<float>(begin, offset, numPixels * nSamplesPerCoord);
-
-    // allocate payload data 
-    params.alpha        = define_buffer<float>(begin, offset, numPixels);
-    params.color_or_org = define_buffer<vec3f>(begin, offset, numPixels);
-    params.pixel_index  = define_buffer<uint32_t>(begin, offset, numPixels);
-    params.jitter       = define_buffer<float>(begin, offset, numPixels);
-#if ADAPTIVE_SAMPLING
-    params.iter_cell   = define_buffer<vec3i>(begin, offset, numPixels);
-    params.iter_t_next = define_buffer<vec3f>(begin, offset, numPixels);
-#endif
-    params.iter_next_cell_begin = define_buffer<float>(begin, offset, numPixels);
-
-    // we also need a launch index buffer
-    params.counter = define_buffer<uint32_t>(begin, offset, 1);
-  }
-
-  if (iterative) {
-    do_raymarching_iterative(stream, params, network, numPixels);
-  }
-  else {
-    do_raymarching_trivial(stream, params);
-  }
-}
-
-inline __device__ float
-sample_size_scaler(const float ss, const float t0, const float t1) {
-  const int32_t N = (t1-t0) / ss + 1;
-  return (t1-t0) / N;
-  // return ss;
-}
-
-template<typename F>
-inline __device__ void
-raymarching_iterator(const DeviceVolume& self, 
-                     const vec3f& org, const vec3f& dir,
-                     const float tMin, const float tMax, 
-                     const float step, const F& body, 
-                     bool debug = false)
-{
-#if ADAPTIVE_SAMPLING
-
-  const auto& dims = self.macrocell_dims;
-  const vec3f m_org = org * self.macrocell_spacings_rcp;
-  const vec3f m_dir = dir * self.macrocell_spacings_rcp;
-  dda::dda3(m_org, m_dir, tMin, tMax, dims, debug, [&](const vec3i& cell, float t0, float t1) {
-    // calculate max opacity
-    float r = opacityUpperBound(self, cell);
-    if (fabsf(r) <= float_epsilon) return true; // the cell is empty
-    // estimate a step size
-    const auto ss = sample_size_scaler(adaptiveSamplingRate(step, r), t0, t1);
-    // iterate within the interval
-    vec2f t = vec2f(t0, min(t1, t0 + ss));
-    while (t.y > t.x) {
-      if (!body(t)) return false;
-      t.x = t.y;
-      t.y = min(t.x + ss, t1);
-    }
-    return true;
-  });
-
-#else
-
-  vec2f t = vec2f(tMin, min(tMax, tMin + step));
-  while ((t.y > t.x) && body(t)) {
-    t.x = t.y;
-    t.y = min(t.x + step, tMax);
-  }
-
-#endif
-}
-
-//------------------------------------------------------------------------------
-//
-// ------------------------------------------------------------------------------
-
-inline __device__ float
-raymarching_transmittance(const DeviceVolume& self,
-                          const ShadowMapData& params,
-                          const vec3f& org, const vec3f& dir,
-                          float t0, float t1,
-                          float sampling_scale,
-                          RandomTEA& rng)
-{
-  const auto marching_step = sampling_scale * self.step;
-  float alpha(0);
-  if (intersectVolume(t0, t1, org, dir, self)) {
-    // jitter ray to remove ringing effects
-    const float jitter = rng.get_floats().x;
-    // start marching
-    raymarching_iterator(self, org, dir, t0, t1, marching_step, [&](const vec2f& t) {
-      // sample data value
-      const auto p = org + lerp(jitter, t.x, t.y) * dir; // object space position
-      const auto sampleValue = sampleVolume(self.volume, p);
-      // classification
-      vec3f sampleColor;
-      float sampleAlpha;
-      sampleTransferFunction(self.tfn, sampleValue, sampleColor, sampleAlpha);
-      opacityCorrection(self, t.y - t.x, sampleAlpha);
-      // blending
-      alpha += (1.f - alpha) * sampleAlpha;
-      return alpha < nearly_one;
-    });
-  }
-  return 1.f - alpha;
-}
-
-inline __device__ vec3f
-shade_scivis_light(const vec3f& ray_dir, const vec3f& normal, const vec3f& albedo, const SciVisMaterial& mat)
-{
-  vec3f color = 0.f;
-
-  if (dot(normal, normal) > 1.0e-6) {
-    const auto N = normalize(normal);
-    const auto V = -ray_dir;
-    color += mat.ambient * albedo;
-    const float cosNL = std::max(dot(N, V), 0.f);
-    if (cosNL > 0.0f) {
-      color += mat.diffuse * cosNL * albedo;
-      const vec3f H = normalize(N + V);
-      const float cosNH = std::max(dot(N, H), 0.f);
-      color += mat.specular * powf(cosNH, mat.shininess);
-    }
-  }
-
-  const vec3f shading2 = shade_simple_light(ray_dir, normal, albedo);
-
-  return lerp(0.5, shading2, color);
-}
-
-inline __device__ vec4f
-raymarching_traceray(const DeviceVolume& self,
-                     const ShadowMapData& params,
-                     const affine3f& wto, // world to object
-                     const affine3f& otw, // object to world
-                     const Ray& ray, float t0, float t1,
-                     RandomTEA& rng)
-{
-  const auto& marchingStep = self.step;
-  const auto& gradientStep = self.grad_step;
-  // const auto& shadingScale = params.scivis_shading_scale;
-
-  float alpha(0);
-  vec3f color(0);
-
-  if (intersectVolume(t0, t1, ray.org, ray.dir, self)) {
-
-    auto w_org = xfmVector(otw, ray.org);
-    auto w_dir = xfmVector(otw, ray.dir);
-
-    // jitter ray to remove ringing effects
-    const float jitter = rng.get_floats().x;
-
-    // start marching
-    raymarching_iterator(self, ray.org, ray.dir, t0, t1, marchingStep, [&](const vec2f& t) {
-      assert(t.x < t.y);
-
-      // sample data value
-      const auto p = ray.org + lerp(jitter, t.x, t.y) * ray.dir; // object space position
-      const auto sampleValue = sampleVolume(self.volume, p);
-
-      // classification
-      vec3f sampleColor;
-      float sampleAlpha;
-      sampleTransferFunction(self.tfn, sampleValue, sampleColor, sampleAlpha);
-      opacityCorrection(self, t.y - t.x, sampleAlpha);
-
-      // access gradient
-      const vec3f No = -sampleGradient(self.volume, p, sampleValue, gradientStep); // sample gradient
-      const vec3f Nw = xfmNormal(otw, No);
-
-      const float tr = 1.f - alpha;
-
-      // object space to world space
-      const auto ldir = xfmVector(wto, normalize(params.light_directional_dir));
-      const auto rdir = xfmVector(otw, ray.dir);
-      // single shade
-      const float transmittance = raymarching_transmittance(self, params, p, ldir, 0.f, float_large, /*make baseline more expensive...*/ 1.0, rng);
-      sampleColor = lerp(0.8, sampleColor, transmittance * sampleColor);
-
-      color += tr * sampleColor * sampleAlpha;
-      alpha += tr * sampleAlpha;
-
-      return alpha < nearly_one;
-    });
-
-  }
-
-  return vec4f(color, alpha);
-}
-
-__global__ void
-raymarching_kernel(uint32_t width, uint32_t height, const ShadowMapData params)
-{
-  // compute pixel ID
-  const size_t ix = threadIdx.x + blockIdx.x * blockDim.x;
-  const size_t iy = threadIdx.y + blockIdx.y * blockDim.y;
-
-  if (ix >= width)  return;
-  if (iy >= height) return;
-
-  const auto& volume = *params.volume;
-  assert(width  == params.frame.size.x && "incorrect framebuffer size");
-  assert(height == params.frame.size.y && "incorrect framebuffer size");
-
-  // normalized screen plane position, in [0,1]^2
-  const auto& camera = params.camera;
-  const vec2f screen(vec2f((float)ix + .5f, (float)iy + .5f) / vec2f(params.frame.size));
-
-  // get the object to world transformation
-  const affine3f otw = params.transform;
-  const affine3f wto = otw.inverse();
-
-  // pixel index
-  const uint32_t fbIndex = ix + iy * width;
-
-  // random number generator
-  RandomTEA rng_state(params.frame_index, fbIndex);
-
-  // generate ray direction
-  Ray ray;
-  ray.org = xfmPoint(wto, camera.position);
-  ray.dir = xfmVector(wto, normalize(/* -z axis */ camera.direction +
-                                     /* x shift */ (screen.x - 0.5f) * camera.horizontal +
-                                     /* y shift */ (screen.y - 0.5f) * camera.vertical));
-
-  // trace ray
-  const vec4f output = raymarching_traceray(volume, params, wto, otw, ray, 0.f, float_large, rng_state);
-
-  // and write to frame buffer ...
-  writePixelColor(params, output, fbIndex);
-}
-
-void
-do_raymarching_trivial(cudaStream_t stream, const ShadowMapData& params)
-{
-  util::bilinear_kernel(raymarching_kernel, 0, stream, params.frame.size.x, params.frame.size.y, params);
-}
-
-
-
-// ------------------------------------------------------------------------------
-//
-// ------------------------------------------------------------------------------
-
-__global__ void
-iterative_intersect_kernel(uint32_t numRays, const ShadowMapData params, int N_ITERS)
-{
-  const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
-  if (i >= numRays) return;
-
-  // other constants
-  const auto& self = *(params.volume);
-
-  // load payloads and rays
-  SampleStreamingPayload payload(params, i);
-  const Ray ray = payload.compute_ray(params);
-
-  float tmin = 0.f, tmax = float_large;
-  const bool hashit = intersectVolume(tmin, tmax, ray.org, ray.dir, self);
-  assert(hashit);
-
-  vec3f* __restrict__ coords = (vec3f*)params.inference_input;
-
-  int k = 0;
-  payload.iter.exec(self, ray.org, ray.dir, tmin, tmax, self.step, payload.pixel_index, [&](const vec2f& t) {
-    assert(k < N_ITERS);
-    assert(t.x < t.y);
-    const vec3f c = ray.org + lerp(payload.jitter, t.x, t.y) * ray.dir;
-    coords[numRays * k + i] = c;
-    return (++k) < N_ITERS;
-  });
-}
-
-template<ShadingMode MODE> 
-__global__ void
-iterative_compose_kernel(uint32_t numRays, const ShadowMapData params, int N_ITERS)
-{
-  const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
-  if (i >= numRays) return;
-
-  const auto& self = *(params.volume);
-  // const auto& shadingScale = params.scivis_shading_scale;
-  const auto* __restrict__ shadingCoefs = params.inference_output;
-
-  SampleStreamingPayload payload(params, i);
-  Ray ray = payload.compute_ray(params);
-
-  float tmin = 0.f, tmax = float_large;
-  const bool hashit = intersectVolume(tmin, tmax, ray.org, ray.dir, self);
-  assert(hashit);
-  
-  const affine3f& otw = params.transform;
-
-  int k = 0;
-  payload.iter.exec(self, ray.org, ray.dir, tmin, tmax, self.step, payload.pixel_index, [&](const vec2f& t) {
-    assert(k < N_ITERS);
-    assert(t.x < t.y);
-
-    // classification
-    const auto c = ray.org + lerp(payload.jitter, t.x, t.y) * ray.dir;
-    const auto sampleValue = sampleVolume(self.volume, c);
-    vec3f sampleColor;
-    float sampleAlpha;
-    sampleTransferFunction(self.tfn, sampleValue, sampleColor, sampleAlpha);
-    opacityCorrection(self, t.y - t.x, sampleAlpha);
-
-    // access gradient
-    const vec3f No = -sampleGradient(self.volume, c, sampleValue, self.grad_step); // sample gradient
-    const vec3f Nw = xfmNormal(otw, No);
-
-    // shading
-    if (MODE == SHADING) {
-      float coef = clamp(shadingCoefs[numRays * k + i], 0.f, 1.f);
-      const auto rdir = xfmVector(otw, ray.dir);
-      const vec3f shadingColor = shade_scivis_light(rdir, Nw, sampleColor, params.material);
-      sampleColor = lerp(0.8, sampleColor, coef * shadingColor);
-    }
-
-    // blending
-    const float tr = 1.f - ray.alpha;
-    ray.alpha += tr * sampleAlpha;
-    ray.color += tr * sampleColor * sampleAlpha;
-
-    // conditions to continue iterating
-    return ((++k) < N_ITERS) && (ray.alpha < nearly_one);
-  });
-
-  payload.set_ray(ray);
-  const bool resumable = payload.iter.resumable(self, ray.dir, tmin, tmax);
-  if (ray.alpha < nearly_one && resumable) {
-    payload.save(params, new_ray_index(params));
-  }
-  else {
-    writePixelColor(params, vec4f(ray.color, ray.alpha), payload.pixel_index);
-  }
-}
-
-__global__ void
-iterative_raygen_kernel_camera(uint32_t numRays, const ShadowMapData params) 
-{
-  // compute ray ID
-  const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
-  if (i >= numRays) return;
-
-  // generate data
-  const auto& self = *((DeviceVolume*)params.volume);
-
-  // random number generator
-  RandomTEA rng = RandomTEA(params.frame_index, i);
-  vec2f jitters = rng.get_floats();
-  // payload & ray
-  SampleStreamingPayload payload(i, jitters.x);
-  const Ray ray = payload.compute_ray(params);
-
-  // intersect with volume bbox & write outputs
-  float tmin = 0.f, tmax = float_large;
-  if (intersectVolume(tmin, tmax, ray.org, ray.dir, self)) {
-    payload.iter = RayMarchingIter(self, ray.org, ray.dir, tmin, tmax);
-    payload.save(params, new_ray_index(params));
-  }
-  else {
-    writePixelColor(params, vec4f(ray.color, ray.alpha), payload.pixel_index);
-  }
-}
-
-__global__ void
-iterative_sampling_groundtruth_kernel(uint32_t numRays, const ShadowMapData params)
-{
-  const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
-  if (i >= numRays) return;
-
-  const auto& volume = params.volume->volume;
-  
-  const vec3f* __restrict__ inputs = params.inference_input;
-  float* __restrict__ outputs = params.inference_output;
-
-  const auto p = inputs[i];
-  outputs[i] = sampleVolume(volume, p);
-}
-
-void
-iterative_sampling_batch_inference(cudaStream_t stream, uint32_t numRays, const ShadowMapData& params, NeuralVolume* network)
-{
-  network->inference(numRays, (float*)params.inference_input, params.inference_output, stream);
-}
-
-inline bool 
-iterative_ray_compaction(cudaStream_t stream, uint32_t& count, uint32_t* dptr)
-{
-  CUDA_CHECK(cudaMemcpyAsync(&count, dptr, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
-  CUDA_CHECK(cudaStreamSynchronize(stream));
-  return count > 0;
-}
-
-template<ShadingMode MODE> 
-void iterative_raymarching_loop(cudaStream_t stream, const ShadowMapData& params, NeuralVolume* network, uint32_t numRays)
-{
-  const uint32_t numCoordsPerSample = N_ITERS;
-
-  CUDA_CHECK(cudaMemsetAsync(params.counter, 0, sizeof(int32_t), stream));
-  util::linear_kernel(iterative_raygen_kernel_camera, 0, stream, numRays, params);
-
-  while (iterative_ray_compaction(stream, numRays, params.counter)) {
-    // Actually, we could have merged the intersection step with raygen and compose. However, there was a wired error 
-    // and I did not figure out irs origin. Also, having the intersection step inside raygen and compose did not bring
-    // obvious performance benefit, so I left it as it is for now.
-    util::linear_kernel(iterative_intersect_kernel, 0, stream, numRays, params, N_ITERS);
-
-    if (network)
-      iterative_sampling_batch_inference(stream, numCoordsPerSample * numRays, params, network);
-    else
-      util::linear_kernel(iterative_sampling_groundtruth_kernel, 0, stream, numCoordsPerSample * numRays, params);
-
-    CUDA_CHECK(cudaMemsetAsync(params.counter, 0, sizeof(int32_t), stream));
-    util::linear_kernel(iterative_compose_kernel<MODE>, 0, stream, numRays, params, N_ITERS);
-  }
-}
-
-void
-do_raymarching_iterative(cudaStream_t stream, const ShadowMapData& params, NeuralVolume* network, uint32_t numRays)
-{
-  if (params.mode == NO_SHADING)
-    iterative_raymarching_loop<NO_SHADING>(stream, params, network, numRays);
-  else
-    iterative_raymarching_loop<SHADING>(stream, params, network, numRays);
-}
-
-INSTANT_VNR_NAMESPACE_END
diff --git a/device/method_shadowmap.h b/device/method_shadowmap.h
deleted file mode 100644
index b7fdd09..0000000
--- a/device/method_shadowmap.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#pragma once
-
-#include "../instantvnr_types.h"
-#include "../network.h"
-
-#include <cuda/cuda_buffer.h>
-
-
-INSTANT_VNR_NAMESPACE_BEGIN
-
-class MethodShadowMap
-{
-public:
-  enum ShadingMode { NO_SHADING = 0, SHADING };
-
-  ~MethodShadowMap() { clear(0); }
-  void render(cudaStream_t stream, const LaunchParams& params, ShadingMode mode, DeviceVolume* volume, NeuralVolume* nvr = nullptr, bool iterative = false);
-  void clear(cudaStream_t stream) { sample_streaming_buffer.free(stream); }
-
-private:
-  CUDABuffer sample_streaming_buffer;
-};
-
-INSTANT_VNR_NAMESPACE_END

From 2aebc10fcf44e857112e226b320d9d883f227919 Mon Sep 17 00:00:00 2001
From: Qi Wu <qiwu@nvidia.com>
Date: Sun, 19 Apr 2026 09:40:49 -0700
Subject: [PATCH 3/5] Update submodule URL and branch for open-volume-renderer
 in .gitmodules

---
 .gitmodules | 3 ++-
 base        | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index f2652c9..20a3b43 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -3,4 +3,5 @@
 	url = https://github.com/wilsonCernWq/tiny-cuda-nn.git
 [submodule "base"]
 	path = base
-	url = git@github.com:wilsonCernWq/open-volume-renderer.git
+	url = https://github.com/VIDILabs/open-volume-renderer.git
+	branch = reorg
diff --git a/base b/base
index d66050c..e13dceb 160000
--- a/base
+++ b/base
@@ -1 +1 @@
-Subproject commit d66050c1e2bcf7755cd70efa86b3efdcf0eaef01
+Subproject commit e13dcebf2b0da6c4771158118a6d921dd8af22b5

From 0c9e1435b2651ca1ff5cf1c530e6ddbdcafeec56 Mon Sep 17 00:00:00 2001
From: Qi Wu <qiwu@nvidia.com>
Date: Mon, 20 Apr 2026 15:27:19 -0700
Subject: [PATCH 4/5] Add .dockerignore file and update CMake configuration for
 standalone builds

- Introduced a .dockerignore file to exclude unnecessary files from Docker builds.
- Updated CMakeLists.txt to support installation as a relocatable package and improved target installation settings.
- Enhanced setup_cmake.sh to allow for separate installation steps.
- Modified Dockerfile to use a newer CUDA base image and adjusted build commands for better clarity.
- Improved README with instructions for using instantvnr as a CMake package in other projects.
---
 .dockerignore                      | 24 +++++++++
 .github/workflows/docker-image.yml | 46 ++++++++++++----
 CMakeLists.txt                     | 85 ++++++++++++++++++++++++++++--
 Dockerfile                         | 62 ++++++++++------------
 README.md                          | 25 ++++++++-
 apps/CMakeLists.txt                | 23 ++++++++
 base                               |  2 +-
 core/CMakeLists.txt                | 20 +++++--
 setup_cmake.sh                     | 14 ++++-
 9 files changed, 246 insertions(+), 55 deletions(-)
 create mode 100644 .dockerignore

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..b9a6c1b
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,24 @@
+.git
+**/.git
+.github/
+.cursor/
+
+build
+build/
+build_*/
+base/build/
+cmake-build-*/
+dist/
+
+__pycache__/
+*.py[cod]
+.pytest_cache/
+.mypy_cache/
+.cache/
+.venv/
+env/
+venv/
+
+.vscode/
+.idea/
+imgui.ini
diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
index 4a1bf08..7fb8ae3 100644
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -2,20 +2,46 @@ name: Docker Image CI
 
 on:
   push:
-    branches: [ "public", "github-actions" ]
+    branches: [main, master, public, reorg]
   pull_request:
-    branches: [ "public", "github-actions" ]
+    branches: [main, master, public, reorg]
 
-jobs:
-
-  build:
+concurrency:
+  group: docker-${{ github.head_ref || github.ref }}
+  cancel-in-progress: true
 
+jobs:
+  docker:
+    name: Docker (CUDA_ARCH=${{ matrix.cuda_arch }})
     runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        cuda_arch: ["86"]
 
     steps:
-    - uses: actions/checkout@v3
-      with:
-        submodules: 'recursive'
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Free disk space
+        uses: jlumbroso/free-disk-space@v1.3.1  # pinned release; @main is a mutable ref
+        with:
+          tool-cache: true
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          swap-storage: true
+
+      - name: Build Docker image
+        run: |
+          docker build . \
+            --file Dockerfile \
+            --build-arg CUDA_ARCH=${{ matrix.cuda_arch }} \
+            --tag instantvnr:ci-${{ github.sha }}
 
-    - name: Build the Docker image
-      run: docker build . --file Dockerfile --tag instantvnr:$(date +%s)
+      - name: Verify install prefix exists
+        run: |
+          docker run --rm instantvnr:ci-${{ github.sha }} \
+            ls /instantvnr/install/lib/cmake/instantvnr/instantvnrConfig.cmake
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 18f6efb..12fb28c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,8 +5,8 @@
 # ======================================================================== #
 #
 # Standalone build — OVR is an in-tree submodule, TCNN is built from source
-# via FetchContent, and the result is installed as a self-contained
-# `instantvnr` Python package directory.
+# via FetchContent, and the result can be installed as a relocatable CMake
+# package that downstream projects consume via `find_package(instantvnr)`.
 #
 
 if (POLICY CMP0048)
@@ -14,11 +14,20 @@ if (POLICY CMP0048)
 endif()
 
 cmake_minimum_required(VERSION 3.24)
-project(instantvnr LANGUAGES C CXX CUDA)
+project(instantvnr VERSION 0.0.0 LANGUAGES C CXX CUDA)
 
 # OVR cmake modules (configure_build_type, configure_cxx, configure_cuda …)
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/base/cmake")
 
+include(GNUInstallDirs)
+include(CMakePackageConfigHelpers)
+
+set(INSTANTVNR_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
+set(INSTANTVNR_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}/instantvnr")
+set(INSTANTVNR_INSTALL_CMAKEDIR "${CMAKE_INSTALL_LIBDIR}/cmake/instantvnr")
+# Tell the embedded OVR tree where its public headers land in this package.
+set(OVR_INSTALL_INCLUDEDIR "${INSTANTVNR_INSTALL_INCLUDEDIR}")
+
 include(configure_build_type)
 include(configure_cxx)
 include(configure_cuda)
@@ -117,3 +126,73 @@ add_subdirectory(core)
 # co-installed and available for optional OVR integration.
 add_subdirectory(device)
 add_subdirectory(apps)
+
+add_library(instantvnr::instantvnr ALIAS instantvnr)
+
+install(TARGETS instantvnr
+  EXPORT instantvnrTargets
+  LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+  ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+  RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
+)
+
+install(TARGETS gdt
+  EXPORT instantvnrTargets
+)
+
+install(TARGETS tfnmodule
+  EXPORT instantvnrTargets
+  LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+  ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+  RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
+)
+
+# The public `instantvnr` target depends on rendercommon at runtime, so install
+# it alongside the main shared library and add it to the `instantvnrTargets`
+# export set, so the exported `instantvnr` target can reference it.
+install(TARGETS rendercommon
+  EXPORT instantvnrTargets
+  LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+  ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+  RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
+)
+
+# Install the header surface required by `api.h`.
+install(FILES
+  "${CMAKE_CURRENT_SOURCE_DIR}/api.h"
+  DESTINATION "${INSTANTVNR_INSTALL_INCLUDEDIR}"
+)
+install(FILES
+  "${CMAKE_CURRENT_SOURCE_DIR}/core/mathdef.h"
+  DESTINATION "${INSTANTVNR_INSTALL_INCLUDEDIR}/core"
+)
+install(DIRECTORY
+  "${CMAKE_CURRENT_SOURCE_DIR}/base/gdt/gdt"
+  DESTINATION "${INSTANTVNR_INSTALL_INCLUDEDIR}"
+)
+install(DIRECTORY
+  "${CMAKE_CURRENT_SOURCE_DIR}/base/extern/json"
+  DESTINATION "${INSTANTVNR_INSTALL_INCLUDEDIR}"
+)
+
+write_basic_package_version_file(
+  "${CMAKE_CURRENT_BINARY_DIR}/instantvnrConfigVersion.cmake"
+  VERSION "${PROJECT_VERSION}"
+  COMPATIBILITY SameMajorVersion
+)
+
+export(EXPORT instantvnrTargets
+  FILE "${CMAKE_CURRENT_BINARY_DIR}/instantvnrConfig.cmake"
+  NAMESPACE instantvnr::
+)
+
+install(EXPORT instantvnrTargets
+  FILE instantvnrConfig.cmake
+  NAMESPACE instantvnr::
+  DESTINATION "${INSTANTVNR_INSTALL_CMAKEDIR}"
+)
+
+install(FILES
+  "${CMAKE_CURRENT_BINARY_DIR}/instantvnrConfigVersion.cmake"
+  DESTINATION "${INSTANTVNR_INSTALL_CMAKEDIR}"
+)
diff --git a/Dockerfile b/Dockerfile
index 5c71a5f..71d641b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,16 +1,15 @@
-# Example Command to Run
-#   docker build -t instantvnr .
+# Example commands:
+#   docker build --build-arg CUDA_ARCH=86 -t instantvnr .
 #   xhost +si:localuser:root
-#   docker run --runtime=nvidia -ti --rm -e DISPLAY -v /tmp/.X11-unix:/tmp/.X11-unix -w /instantvnr/build instantvnr
+#   docker run --gpus all -ti --rm -e DISPLAY -v /tmp/.X11-unix:/tmp/.X11-unix -w /instantvnr/build instantvnr
 
-FROM nvidia/cuda:11.8.0-devel-ubuntu20.04
+FROM nvidia/cuda:12.8.1-devel-ubuntu24.04
 
 # Select a CUDA architecture to build. Currently we do not support multi-arch builds.
-ARG CUDA_ARCH=70
+ARG CUDA_ARCH=90
 ARG DEBIAN_FRONTEND=noninteractive
 
-RUN apt-get update 
-RUN apt-get install -y --no-install-recommends \
+RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential mesa-utils pkg-config \
         libglx0 libglvnd0 libglvnd-dev \
         libgl1 libgl1-mesa-dev \
@@ -18,37 +17,32 @@ RUN apt-get install -y --no-install-recommends \
         libgles2 libgles2-mesa-dev \
         libxrandr-dev libxinerama-dev libxcursor-dev libxi-dev libssl-dev \
         libaio-dev \
-        wget git ninja-build imagemagick
-# RUN rm -rf /var/lib/apt/lists/*
+        wget git ninja-build imagemagick ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
 
-ENV NVIDIA_VISIBLE_DEVICES all
-ENV NVIDIA_DRIVER_CAPABILITIES compute,utility,graphics
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,graphics
 ADD https://raw.githubusercontent.com/NVlabs/nvdiffrec/main/docker/10_nvidia.json \
     /usr/share/glvnd/egl_vendor.d/10_nvidia.json
 
-# Install cmake
-RUN wget -qO- "https://cmake.org/files/v3.23/cmake-3.23.2-linux-x86_64.tar.gz" | tar --strip-components=1 -xz -C /usr/local
+# Install CMake (3.24+ required by the standalone build).
+RUN wget -qO- "https://cmake.org/files/v3.28/cmake-3.28.3-linux-x86_64.tar.gz" | tar --strip-components=1 -xz -C /usr/local
 
-# Install tbb
+# Install TBB
 RUN wget -qO- "https://github.com/oneapi-src/oneTBB/releases/download/v2021.9.0/oneapi-tbb-2021.9.0-lin.tgz" | tar --strip-components=1 -xz -C /usr/local
 
-# Create a superbuild
-RUN git clone --recursive https://github.com/VIDILabs/open-volume-renderer.git /instantvnr/ovr
-# RUN git clone --recursive https://github.com/VIDILabs/instantvnr.git /instantvnr/source
-COPY . /instantvnr/source
-RUN ln -s /instantvnr/source /instantvnr/ovr/projects/instantvnr
-
-# Config and build
-RUN mkdir -p /instantvnr/build
-RUN cmake -S /instantvnr/ovr -B/instantvnr/build -GNinja \
-    -DOptiX_INSTALL_DIR=/instantvnr/ovr/github-actions/optix-cmake-github-actions/NVIDIA-OptiX-SDK-7.3.0-linux64-x86_64 \
-    -DGDT_CUDA_ARCHITECTURES=${CUDA_ARCH} \
-    -DOVR_BUILD_MODULE_NNVOLUME=ON \
-    -DOVR_BUILD_DEVICE_OSPRAY=OFF \
-    -DOVR_BUILD_DEVICE_OPTIX7=ON
-RUN cmake --build /instantvnr/build --config Release --parallel 16
-
-RUN ln -s /instantvnr/ovr/data /instantvnr/build/data
-RUN cp /instantvnr/source/example-model.json /instantvnr/build/example-model.json
-
-WORKDIR [ '/instantvnr/build' ]
+WORKDIR /instantvnr
+COPY . /instantvnr
+
+# Configure and build the standalone project directly from this repository.
+RUN SM=${CUDA_ARCH} BUILD_DIR=/instantvnr/build bash ./setup_cmake.sh
+
+RUN BUILD_DIR=/instantvnr/build INSTALL_PREFIX=/instantvnr/install \
+    bash ./setup_cmake.sh --install
+
+RUN ln -s /instantvnr/data /instantvnr/build/data \
+    && cp /instantvnr/example-model.json /instantvnr/build/example-model.json
+
+ENV CMAKE_PREFIX_PATH=/instantvnr/install
+
+WORKDIR /instantvnr/build
diff --git a/README.md b/README.md
index 51d151a..e7f9ebb 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ cd ..
 # Build
 mkdir build
 cd build
-cmake .. -DGDT_CUDA_ARCHITECTURES=86 -DOVR_BUILD_MODULE_NNVOLUME=ON -DOVR_BUILD_DEVICE_OSPRAY=OFF -DOVR_BUILD_DEVICE_OPTIX7=OFF
+cmake .. -DCMAKE_CUDA_ARCHITECTURES=86 -DOVR_BUILD_MODULE_NNVOLUME=ON -DOVR_BUILD_DEVICE_OSPRAY=OFF -DOVR_BUILD_DEVICE_OPTIX7=OFF
 cmake --build . --config Release --parallel 16
 
 # In the binary output directory, setup symbolic links to the data folder
@@ -72,6 +72,29 @@ docker run --gpus device=0 --runtime=nvidia -ti  --rm -e DISPLAY -v /tmp/.X11-un
 
 You can also directly execute apps through the docker container
 
+#### Using from Another CMake Project
+
+You can also install `instantvnr` as a CMake package and consume it from
+another repository:
+
+```bash
+cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_ARCHITECTURES=86
+cmake --build build --parallel
+cmake --install build --prefix /path/to/instantvnr-install
+```
+
+Add the prefix to `CMAKE_PREFIX_PATH`, then in the downstream `CMakeLists.txt`:
+
+```cmake
+find_package(instantvnr CONFIG REQUIRED)
+
+add_executable(my_app main.cpp)
+target_link_libraries(my_app PRIVATE instantvnr::instantvnr)
+```
+
+The exported target preserves the current public include layout, so downstream
+code can continue to include headers such as `#include <api.h>`.
+
 
 ### Citation
 ```bibtex
diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt
index b5d1e33..44e3b6a 100644
--- a/apps/CMakeLists.txt
+++ b/apps/CMakeLists.txt
@@ -14,6 +14,22 @@ if(NOT target)
   set(target instantvnr)
 endif()
 
+set(_instantvnr_app_common_include_dirs
+  "${CMAKE_CURRENT_LIST_DIR}/../base/ovr/common"
+  "${PROJECT_BINARY_DIR}/base/ovr/common"
+)
+
+function(ivnr_configure_app app_target)  # shared per-app setup; app_target = name of an existing executable target
+  target_link_libraries(${app_target} PRIVATE rendercommon)
+  target_include_directories(${app_target} PRIVATE ${_instantvnr_app_common_include_dirs})
+  target_compile_options(${app_target} PRIVATE
+    "$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_NVCC_FLAGS}>"  # quoted so a multi-flag `;` list stays inside one genex
+  )
+  target_link_options(${app_target} PRIVATE
+    "LINKER:--allow-shlib-undefined"
+  )
+endfunction()
+
 # ======================================================================== #
 # applications
 # ======================================================================== #
@@ -21,26 +37,33 @@ endif()
 if(ENABLE_OPENGL)
   add_executable(vnr_int_dual int_dual_volume.cpp)
   target_link_libraries(vnr_int_dual   PRIVATE ${target} ${GFX_LIBRARIES})
+  ivnr_configure_app(vnr_int_dual)
 
   add_executable(vnr_int_single int_volume.cpp)
   target_link_libraries(vnr_int_single PRIVATE ${target} ${GFX_LIBRARIES})
+  ivnr_configure_app(vnr_int_single)
 endif()
 
 add_executable(vnr_cmd_train  batch_trainer.cpp)
 target_link_libraries(vnr_cmd_train PRIVATE ${target})
+ivnr_configure_app(vnr_cmd_train)
 
 add_executable(vnr_cmd_render batch_renderer.cpp)
 target_link_libraries(vnr_cmd_render PRIVATE ${target})
+ivnr_configure_app(vnr_cmd_render)
 
 if(ENABLE_IN_SHADER)
   add_executable(vnr_cmd_isosurface batch_isosurface.cpp)
   target_link_libraries(vnr_cmd_isosurface PRIVATE ${target})
+  ivnr_configure_app(vnr_cmd_isosurface)
 endif()
 
 add_executable(view_model view_model.cpp)
 target_link_libraries(view_model PRIVATE ${target})
+ivnr_configure_app(view_model)
 
 if(ENABLE_IN_SHADER)
   add_executable(vnr_int_isosurface int_isosurface.cu)
   target_link_libraries(vnr_int_isosurface PUBLIC renderlib ${target} ${GFX_LIBRARIES})
+  ivnr_configure_app(vnr_int_isosurface)
 endif()
diff --git a/base b/base
index e13dceb..0920a83 160000
--- a/base
+++ b/base
@@ -1 +1 @@
-Subproject commit e13dcebf2b0da6c4771158118a6d921dd8af22b5
+Subproject commit 0920a83f1a242a71c5da9fa92d3c6c898dc9e39e
diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index 581dba8..13b3782 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -79,7 +79,7 @@ if(ENABLE_IN_SHADER)
     networks/tcnn_device_network.cu
     marching_cube.cu
   )
-  target_compile_definitions(${target} PUBLIC ENABLE_IN_SHADER=1)
+  target_compile_definitions(${target} PRIVATE ENABLE_IN_SHADER=1)
 endif()
 
 if(ENABLE_OUT_OF_CORE)
@@ -105,14 +105,24 @@ PROPERTIES
 )
 
 target_include_directories(${target} PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR})
-target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/..>)
-target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/../..>)
+target_include_directories(${target} PUBLIC
+  $<BUILD_INTERFACE:${INSTANTVNR_SOURCE_DIR}>
+  $<BUILD_INTERFACE:${INSTANTVNR_SOURCE_DIR}/base/gdt>
+  $<BUILD_INTERFACE:${INSTANTVNR_SOURCE_DIR}/base/extern>
+  $<INSTALL_INTERFACE:${INSTANTVNR_INSTALL_INCLUDEDIR}>
+)
 
 target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe="--diag_suppress=177">)   # variable declared but not referenced
 target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe="--diag_suppress=20044">) # extern variable treated as static
 
-target_link_libraries(${target} PRIVATE TBB::tbb curand tiny-cuda-nn)
-target_link_libraries(${target} PUBLIC gdt util tfnmodule)
+target_link_libraries(${target} PRIVATE
+  $<BUILD_INTERFACE:TBB::tbb>
+  $<BUILD_INTERFACE:curand>
+  $<BUILD_INTERFACE:tiny-cuda-nn>
+  $<BUILD_INTERFACE:gdt>
+  rendercommon
+  $<BUILD_INTERFACE:tfnmodule>
+)
 
 # GLIBCXX ABI compile definition — must match the installed PyTorch
 target_compile_definitions(${target} PUBLIC ${IVNR_GLIBCXX_CXX11_ABI})
diff --git a/setup_cmake.sh b/setup_cmake.sh
index 2298129..ad5534e 100644
--- a/setup_cmake.sh
+++ b/setup_cmake.sh
@@ -8,6 +8,8 @@
 #   BUILD_DIR=build ./setup_cmake.sh  # custom build directory
 #   ./setup_cmake.sh --configure   # configure only (skip build)
 #   ./setup_cmake.sh --build       # build only (skip configure)
+#   ./setup_cmake.sh --install     # install only (skip configure and build)
+#   INSTALL_PREFIX=/opt/instantvnr ./setup_cmake.sh --install
 #
 # Requires:
 #   - CUDA toolkit (nvcc in PATH or /usr/local/cuda)
@@ -19,10 +21,12 @@ BUILD_DIR="${BUILD_DIR:-${SCRIPT_DIR}/build}"
 
 DO_CONFIGURE=true
 DO_BUILD=true
+DO_INSTALL=false
 for arg in "$@"; do
   case "$arg" in
     --configure) DO_BUILD=false ;;
     --build)     DO_CONFIGURE=false ;;
+    --install)   DO_INSTALL=true; DO_CONFIGURE=false; DO_BUILD=false ;;
   esac
 done
 
@@ -62,5 +66,13 @@ if [[ "$DO_BUILD" == true ]]; then
     JOBS="${JOBS:-$(nproc)}"
     echo "[info] Building with $JOBS parallel jobs"
     cmake --build "$BUILD_DIR" --config Release -- -j"$JOBS"
-    echo "[info] Build complete. Outputs in $BUILD_DIR/instantvnr/"
+    echo "[info] Build complete. Outputs in $BUILD_DIR/bin/"
+fi
+
+# ── install ───────────────────────────────────────────────────────────────────
+if [[ "$DO_INSTALL" == true ]]; then
+    INSTALL_PREFIX="${INSTALL_PREFIX:-${SCRIPT_DIR}/install}"
+    echo "[info] Installing to $INSTALL_PREFIX"
+    cmake --install "$BUILD_DIR" --prefix "$INSTALL_PREFIX"
+    echo "[info] Install complete."
 fi

From 059a170126ad9b31455bdf6b046bdaf80200581a Mon Sep 17 00:00:00 2001
From: Qi Wu <qiwu@nvidia.com>
Date: Tue, 21 Apr 2026 15:44:55 -0700
Subject: [PATCH 5/5] Update submodule commit for base project

---
 base | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base b/base
index 0920a83..ff40f12 160000
--- a/base
+++ b/base
@@ -1 +1 @@
-Subproject commit 0920a83f1a242a71c5da9fa92d3c6c898dc9e39e
+Subproject commit ff40f1267b9cbfa2c2338899910fce07538d20e9