NVIDIA · xupinjie · May 20, 2026 · May 20, 2026
diff --git a/build_config/accvlab_build_config/helpers/build_utils.py b/build_config/accvlab_build_config/helpers/build_utils.py
@@ -31,8 +31,8 @@ class CudaArchitectureSelection(NamedTuple):
 
     Attributes:
         architectures: CUDA architectures to build as cubin targets.
-        ptx_architectures: At most one architecture to build as a PTX target
-            because a detected GPU architecture had to be capped.
+        ptx_architectures: CUDA architectures to build as PTX targets because
+            detected GPU architectures were not exact ``nvcc`` cubin targets.
     """
 
     architectures: List[str]
@@ -91,8 +91,8 @@ def _split_cuda_architectures(value: str) -> List[str]:
     return [arch.strip() for arch in re.split(r"[,;]", value) if arch.strip()]
 
 
-def _forward_compatible_ptx_architecture(
-    supported_architectures: List[str], max_architecture: int
+def _supported_ptx_fallback_architecture(
+    supported_architectures: List[str], detected_architecture: int
 ) -> Optional[str]:
     forward_compatible_archs: List[str] = []
     fallback_archs: List[str] = []
@@ -102,7 +102,7 @@ def _forward_compatible_ptx_architecture(
         except ValueError:
             continue
 
-        if arch_int > max_architecture:
+        if arch_int > detected_architecture:
             continue
 
         fallback_archs.append(arch)
@@ -121,47 +121,52 @@ def select_cuda_architectures_for_nvcc(
 ) -> CudaArchitectureSelection:
     """Select CUDA cubin and PTX targets supported by the installed ``nvcc``.
 
-    Numeric architectures above ``nvcc``'s maximum supported architecture are
-    capped to that maximum. When capping occurs, one PTX target is added using
-    the newest forward-compatible base architecture supported by ``nvcc`` at or
-    below the capped architecture. For example, if the highest supported
-    architecture is ``96``, the PTX target is ``90``.
+    A detected architecture is emitted as a cubin target only when
+    ``nvcc --list-gpu-arch`` reports that exact architecture. Unsupported
+    detected architectures use a PTX fallback at or below the detected
+    architecture, preferring the newest supported base architecture where the
+    architecture number is divisible by 10.
 
     Args:
         cuda_architectures: CUDA architecture numbers to select from, for
             example ``["80", "90", "103"]``.
 
     Returns:
-        CudaArchitectureSelection: The capped cubin architectures and, when
-        capping occurred, the single architecture to emit as a PTX target. If
-        ``nvcc`` cannot be found or queried, the input architectures are returned
-        unchanged and no PTX targets are added.
+        CudaArchitectureSelection: The exact cubin architectures and any PTX
+        fallback architectures. If ``nvcc`` cannot be found or queried, the
+        input architectures are returned unchanged and no PTX targets are added.
     """
     supported_archs = _detect_nvcc_supported_architectures()
     if not cuda_architectures or not supported_archs:
         return CudaArchitectureSelection(cuda_architectures, [])
 
-    max_supported = max(int(arch) for arch in supported_archs)
-    capped_archs: List[str] = []
-    any_arch_capped = False
+    supported_arch_set = set(supported_archs)
+    selected_archs: List[str] = []
+    ptx_archs: List[str] = []
+
     for arch in cuda_architectures:
+        if arch in supported_arch_set:
+            if arch not in selected_archs:
+                selected_archs.append(arch)
+            continue
+
         try:
             arch_int = int(arch)
-            capped_arch = str(min(arch_int, max_supported))
-            any_arch_capped = any_arch_capped or arch_int > max_supported
         except ValueError:
-            capped_arch = arch
+            if arch not in selected_archs:
+                selected_archs.append(arch)
+            continue
 
-        if capped_arch not in capped_archs:
-            capped_archs.append(capped_arch)
+        ptx_arch = _supported_ptx_fallback_architecture(supported_archs, arch_int)
+        if ptx_arch is None:
+            if arch not in selected_archs:
+                selected_archs.append(arch)
+            continue
 
-    ptx_archs: List[str] = []
-    if any_arch_capped:
-        ptx_arch = _forward_compatible_ptx_architecture(supported_archs, max_supported)
-        if ptx_arch is not None:
+        if ptx_arch not in ptx_archs:
             ptx_archs.append(ptx_arch)
 
-    return CudaArchitectureSelection(capped_archs, ptx_archs)
+    return CudaArchitectureSelection(selected_archs, ptx_archs)
 
 
 def missing_torch_error() -> RuntimeError:
@@ -294,11 +299,11 @@ def detect_cuda_info():
 def get_compile_flags(config, cuda_info, include_dirs=None):
     """Construct compilation flags.
 
-    If ``CUSTOM_CUDA_ARCHS`` is unset, detected CUDA architectures are capped to
-    the maximum supported by ``nvcc``. If any architecture is capped, the newest
-    forward-compatible base architecture supported by ``nvcc`` is also emitted
-    as a PTX target. If no architecture can be detected, no explicit CUDA
-    architecture flags are generated.
+    If ``CUSTOM_CUDA_ARCHS`` is unset, detected CUDA architectures are emitted
+    as cubin targets only when ``nvcc`` reports exact support. Unsupported
+    detections fall back to supported PTX at or below the detected architecture.
+    If no architecture can be detected, no explicit CUDA architecture flags are
+    generated.
 
     Args:
         config (dict): Build configuration

diff --git a/build_config/accvlab_build_config/helpers/cmake_args.py b/build_config/accvlab_build_config/helpers/cmake_args.py
@@ -94,10 +94,10 @@ def _build_cmake_args_from_env() -> List[str]:
     Build a list of -D CMake arguments from environment variables to harmonize
     build configuration across setuptools, external CMake, and scikit-build flows.
 
-    If ``CUSTOM_CUDA_ARCHS`` is unset, detected CUDA architectures are capped to
-    the maximum supported by ``nvcc``. If capping occurs, CMake builds cubins for
-    the capped architectures and adds one PTX target for the newest supported
-    forward-compatible base architecture.
+    If ``CUSTOM_CUDA_ARCHS`` is unset, detected CUDA architectures become CMake
+    real targets only when ``nvcc`` reports exact support. Unsupported
+    detections use supported virtual/PTX targets at or below the detected
+    architecture.
     """
     args: List[str] = []
     # Always export compile_commands.json for tooling/validation
@@ -196,9 +196,9 @@ def build_cmake_args() -> List[str]:
     """
     Full CMake -D list: environment-based flags plus repo-aligned SCM version define.
 
-    Auto-detected CUDA architectures are capped to ``nvcc`` support when
-    ``CUSTOM_CUDA_ARCHS`` is unset. If capping occurs, one PTX target is emitted
-    for the newest supported forward-compatible base architecture.
+    Auto-detected CUDA architectures use exact ``nvcc`` real targets when
+    supported. Unsupported detections fall back to supported PTX targets at or
+    below the detected architecture when ``CUSTOM_CUDA_ARCHS`` is unset.
     """
     root = get_project_root()
     return _build_cmake_args_from_env() + _build_cmake_args_package_scm_version(root)

diff --git a/build_config/tests/test_cuda_arch_selection.py b/build_config/tests/test_cuda_arch_selection.py
@@ -0,0 +1,95 @@
+import unittest
+from unittest import mock
+
+from accvlab_build_config.helpers import build_utils
+
+
+class CudaArchSelectionTest(unittest.TestCase):
+    def _mock_supported_architectures(self, supported_architectures):
+        return mock.patch.object(
+            build_utils,
+            "_detect_nvcc_supported_architectures",
+            return_value=supported_architectures,
+        )
+
+    def test_exact_supported_architecture_uses_real_target(self):
+        with self._mock_supported_architectures(["80", "90", "100"]):
+            selection = build_utils.select_cuda_architectures_for_nvcc(["90"])
+
+        self.assertEqual(selection.architectures, ["90"])
+        self.assertEqual(selection.ptx_architectures, [])
+
+    def test_unsupported_hole_uses_base_ptx_not_nearby_real(self):
+        with self._mock_supported_architectures(["100", "120"]):
+            selection = build_utils.select_cuda_architectures_for_nvcc(["103"])
+
+        self.assertEqual(selection.architectures, [])
+        self.assertEqual(selection.ptx_architectures, ["100"])
+
+    def test_future_gpu_uses_supported_base_ptx_below_detection(self):
+        with self._mock_supported_architectures(["80", "90", "100", "103", "120"]):
+            selection = build_utils.select_cuda_architectures_for_nvcc(["121"])
+
+        self.assertEqual(selection.architectures, [])
+        self.assertEqual(selection.ptx_architectures, ["120"])
+
+    def test_unsupported_detection_uses_greatest_supported_ptx_without_base(self):
+        with self._mock_supported_architectures(["75", "86", "89"]):
+            selection = build_utils.select_cuda_architectures_for_nvcc(["88"])
+
+        self.assertEqual(selection.architectures, [])
+        self.assertEqual(selection.ptx_architectures, ["86"])
+
+    def test_unsupported_detection_without_lower_support_remains_unchanged(self):
+        with self._mock_supported_architectures(["60", "70"]):
+            selection = build_utils.select_cuda_architectures_for_nvcc(["50"])
+
+        self.assertEqual(selection.architectures, ["50"])
+        self.assertEqual(selection.ptx_architectures, [])
+
+    def test_mixed_exact_and_unsupported_architectures_preserve_order(self):
+        with self._mock_supported_architectures(["80", "90", "100", "120"]):
+            selection = build_utils.select_cuda_architectures_for_nvcc(["90", "103"])
+
+        self.assertEqual(selection.architectures, ["90"])
+        self.assertEqual(selection.ptx_architectures, ["100"])
+
+    def test_duplicate_ptx_fallbacks_are_deduplicated(self):
+        with self._mock_supported_architectures(["100", "120"]):
+            selection = build_utils.select_cuda_architectures_for_nvcc(["103", "103"])
+
+        self.assertEqual(selection.architectures, [])
+        self.assertEqual(selection.ptx_architectures, ["100"])
+
+    def test_no_detected_nvcc_architectures_returns_input_unchanged(self):
+        with self._mock_supported_architectures([]):
+            selection = build_utils.select_cuda_architectures_for_nvcc(["103"])
+
+        self.assertEqual(selection.architectures, ["103"])
+        self.assertEqual(selection.ptx_architectures, [])
+
+    def test_explicit_custom_architectures_are_not_rewritten(self):
+        with mock.patch.object(
+            build_utils,
+            "select_cuda_architectures_for_nvcc",
+            side_effect=AssertionError("unexpected selector call"),
+        ):
+            config = {
+                "CPP_STANDARD": "c++17",
+                "OPTIMIZE_LEVEL": 3,
+                "USE_FAST_MATH": False,
+                "DEBUG_BUILD": False,
+                "ENABLE_PROFILING": False,
+                "VERBOSE_BUILD": False,
+                "CUSTOM_CUDA_ARCHS": ["103"],
+            }
+            cuda_info = {"cuda_available": True, "gpu_architectures": ["103"]}
+
+            flags = build_utils.get_compile_flags(config, cuda_info)
+
+        self.assertIn("-gencode=arch=compute_103,code=sm_103", flags["nvcc"])
+        self.assertNotIn("-gencode=arch=compute_100,code=compute_100", flags["nvcc"])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -90,13 +90,13 @@ RUN pip install torch==2.11.0 torchvision==0.26.0 torchaudio==2.11.0 --index-url
 
 RUN pip install black
 
-RUN pip install sphinx \
-    sphinx-rtd-theme \
-    sphinx-autodoc-typehints \
-    sphinx-autobuild \
-    linkify-it-py \
-    myst-parser \
-    sphinxcontrib-spelling
+RUN pip install sphinx==8.1.3 \
+    sphinx-rtd-theme==3.1.0 \
+    sphinx-autodoc-typehints==3.0.1 \
+    sphinx-autobuild==2024.10.3 \
+    linkify-it-py==2.1.0 \
+    myst-parser==4.0.1 \
+    sphinxcontrib-spelling==8.0.2
 
 RUN pip install ninja \
     scikit-build

diff --git a/docs/guides/INSTALLATION_GUIDE.md b/docs/guides/INSTALLATION_GUIDE.md
@@ -406,7 +406,7 @@ ENABLE_PROFILING=1 ./scripts/package_manager.sh install
 | `OPTIMIZE_LEVEL` | int: `0`–`3` | `3` | Compiler optimization level |
 | `CPP_STANDARD` | string: `c++17` | `c++17` | C++ standard to use |
 | `VERBOSE_BUILD` | bool: `0`/`1`, `true`/`false`, `yes`/`no`, `on`/`off` | `0` | Show detailed build output |
-| `CUSTOM_CUDA_ARCHS` | list: e.g. `"70,75,80"` or `"75;80;86"` | PyTorch auto-detect, then package default | Target CUDA architectures |
+| `CUSTOM_CUDA_ARCHS` | list: e.g. `"70,75,80"` or `"75;80;86"` | PyTorch auto-detect, then package default | Explicit CUDA architecture override |
 | `USE_FAST_MATH` | bool: `0`/`1`, `true`/`false`, `yes`/`no`, `on`/`off` | `1` | Enable fast math optimizations |
 | `ENABLE_PROFILING` | bool: `0`/`1`, `true`/`false`, `yes`/`no`, `on`/`off` | `0` | Enable profiling support |
 
@@ -418,10 +418,13 @@ ENABLE_PROFILING=1 ./scripts/package_manager.sh install
 > GPU architectures via CUDA-enabled PyTorch. Missing PyTorch or CPU-only PyTorch is treated as a build
 > configuration error.
 >
-> Auto-detected architectures are capped to the maximum architecture supported by the
-> installed `nvcc`, which avoids selecting a GPU architecture that is newer than the CUDA toolkit used for
-> the build. When an architecture is capped, the build also includes one PTX target for the newest supported
-> forward-compatible base architecture.
+> Auto-detected architectures are emitted as real/cubin targets only when the installed `nvcc` exactly supports
+> them. If a detected architecture is unsupported, ACCV-Lab instead emits a supported PTX target at or below the
+> detected architecture, preferring base architectures whose number is divisible by 10 (for example, `100` for
+> a detected `103` architecture). If `nvcc` supports nothing at or below it, the architecture is kept unchanged.
+>
+> `CUSTOM_CUDA_ARCHS` is an explicit override. When it is set, ACCV-Lab passes those architectures through
+> unchanged instead of applying the auto-detection fallback logic.
 >
 > If PyTorch is CUDA-enabled but no architecture can be detected
 > (for example because no CUDA device is visible), ACCV-Lab does not pass `CMAKE_CUDA_ARCHITECTURES`;

diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -1,6 +1,6 @@
-sphinx>=5.0.0
-sphinx-rtd-theme>=1.0.0
-sphinx-autodoc-typehints>=1.19.0
-myst-parser>=0.18.0
-linkify-it-py>=2.0.0
-sphinx-autobuild
+sphinx>=8.1.3
+sphinx-rtd-theme>=3.1.0
+sphinx-autodoc-typehints>=3.0.1
+myst-parser>=4.0.1
+linkify-it-py>=2.1.0
+sphinx-autobuild>=2024.10.3
diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt
@@ -211,3 +211,4 @@ picklable
 ABI
 aggregator
 multimodal
+cubin
-Original file line number
+Diff line change
@@ Expand Up / @@ -211,3 +211,4 @@ picklable @@
     ABI
     aggregator
     multimodal
+    cubin