diff --git a/build_config/accvlab_build_config/helpers/build_utils.py b/build_config/accvlab_build_config/helpers/build_utils.py index 05544b2..6db8e7f 100644 --- a/build_config/accvlab_build_config/helpers/build_utils.py +++ b/build_config/accvlab_build_config/helpers/build_utils.py @@ -31,8 +31,8 @@ class CudaArchitectureSelection(NamedTuple): Attributes: architectures: CUDA architectures to build as cubin targets. - ptx_architectures: At most one architecture to build as a PTX target - because a detected GPU architecture had to be capped. + ptx_architectures: CUDA architectures to build as PTX targets because + detected GPU architectures were not exact ``nvcc`` cubin targets. """ architectures: List[str] @@ -91,8 +91,8 @@ def _split_cuda_architectures(value: str) -> List[str]: return [arch.strip() for arch in re.split(r"[,;]", value) if arch.strip()] -def _forward_compatible_ptx_architecture( - supported_architectures: List[str], max_architecture: int +def _supported_ptx_fallback_architecture( + supported_architectures: List[str], detected_architecture: int ) -> Optional[str]: forward_compatible_archs: List[str] = [] fallback_archs: List[str] = [] @@ -102,7 +102,7 @@ def _forward_compatible_ptx_architecture( except ValueError: continue - if arch_int > max_architecture: + if arch_int > detected_architecture: continue fallback_archs.append(arch) @@ -121,47 +121,52 @@ def select_cuda_architectures_for_nvcc( ) -> CudaArchitectureSelection: """Select CUDA cubin and PTX targets supported by the installed ``nvcc``. - Numeric architectures above ``nvcc``'s maximum supported architecture are - capped to that maximum. When capping occurs, one PTX target is added using - the newest forward-compatible base architecture supported by ``nvcc`` at or - below the capped architecture. For example, if the highest supported - architecture is ``96``, the PTX target is ``90``. + A detected architecture is emitted as a cubin target only when + ``nvcc --list-gpu-arch`` reports that exact architecture. Unsupported + detected architectures use a PTX fallback at or below the detected + architecture, preferring the newest supported base architecture where the + architecture number is divisible by 10. Args: cuda_architectures: CUDA architecture numbers to select from, for example ``["80", "90", "103"]``. Returns: - CudaArchitectureSelection: The capped cubin architectures and, when - capping occurred, the single architecture to emit as a PTX target. If - ``nvcc`` cannot be found or queried, the input architectures are returned - unchanged and no PTX targets are added. + CudaArchitectureSelection: The exact cubin architectures and any PTX + fallback architectures. If ``nvcc`` cannot be found or queried, the + input architectures are returned unchanged and no PTX targets are added. """ supported_archs = _detect_nvcc_supported_architectures() if not cuda_architectures or not supported_archs: return CudaArchitectureSelection(cuda_architectures, []) - max_supported = max(int(arch) for arch in supported_archs) - capped_archs: List[str] = [] - any_arch_capped = False + supported_arch_set = set(supported_archs) + selected_archs: List[str] = [] + ptx_archs: List[str] = [] + for arch in cuda_architectures: + if arch in supported_arch_set: + if arch not in selected_archs: + selected_archs.append(arch) + continue + try: arch_int = int(arch) - capped_arch = str(min(arch_int, max_supported)) - any_arch_capped = any_arch_capped or arch_int > max_supported except ValueError: - capped_arch = arch + if arch not in selected_archs: + selected_archs.append(arch) + continue - if capped_arch not in capped_archs: - capped_archs.append(capped_arch) + ptx_arch = _supported_ptx_fallback_architecture(supported_archs, arch_int) + if ptx_arch is None: + if arch not in selected_archs: + selected_archs.append(arch) + continue - ptx_archs: List[str] = [] - if any_arch_capped: - ptx_arch = _forward_compatible_ptx_architecture(supported_archs, max_supported) - if ptx_arch is not None: + if ptx_arch not in ptx_archs: ptx_archs.append(ptx_arch) - return CudaArchitectureSelection(capped_archs, ptx_archs) + return CudaArchitectureSelection(selected_archs, ptx_archs) def missing_torch_error() -> RuntimeError: @@ -294,11 +299,11 @@ def detect_cuda_info(): def get_compile_flags(config, cuda_info, include_dirs=None): """Construct compilation flags. - If ``CUSTOM_CUDA_ARCHS`` is unset, detected CUDA architectures are capped to - the maximum supported by ``nvcc``. If any architecture is capped, the newest - forward-compatible base architecture supported by ``nvcc`` is also emitted - as a PTX target. If no architecture can be detected, no explicit CUDA - architecture flags are generated. + If ``CUSTOM_CUDA_ARCHS`` is unset, detected CUDA architectures are emitted + as cubin targets only when ``nvcc`` reports exact support. Unsupported + detections fall back to supported PTX at or below the detected architecture. + If no architecture can be detected, no explicit CUDA architecture flags are + generated. Args: config (dict): Build configuration diff --git a/build_config/accvlab_build_config/helpers/cmake_args.py b/build_config/accvlab_build_config/helpers/cmake_args.py index bcfba8b..41c5feb 100644 --- a/build_config/accvlab_build_config/helpers/cmake_args.py +++ b/build_config/accvlab_build_config/helpers/cmake_args.py @@ -94,10 +94,10 @@ def _build_cmake_args_from_env() -> List[str]: Build a list of -D CMake arguments from environment variables to harmonize build configuration across setuptools, external CMake, and scikit-build flows. - If ``CUSTOM_CUDA_ARCHS`` is unset, detected CUDA architectures are capped to - the maximum supported by ``nvcc``. If capping occurs, CMake builds cubins for - the capped architectures and adds one PTX target for the newest supported - forward-compatible base architecture. + If ``CUSTOM_CUDA_ARCHS`` is unset, detected CUDA architectures become CMake + real targets only when ``nvcc`` reports exact support. Unsupported + detections use supported virtual/PTX targets at or below the detected + architecture. """ args: List[str] = [] # Always export compile_commands.json for tooling/validation @@ -196,9 +196,9 @@ def build_cmake_args() -> List[str]: """ Full CMake -D list: environment-based flags plus repo-aligned SCM version define. - Auto-detected CUDA architectures are capped to ``nvcc`` support when - ``CUSTOM_CUDA_ARCHS`` is unset. If capping occurs, one PTX target is emitted - for the newest supported forward-compatible base architecture. + Auto-detected CUDA architectures use exact ``nvcc`` real targets when + supported. Unsupported detections fall back to supported PTX targets at or + below the detected architecture when ``CUSTOM_CUDA_ARCHS`` is unset. """ root = get_project_root() return _build_cmake_args_from_env() + _build_cmake_args_package_scm_version(root) diff --git a/build_config/tests/test_cuda_arch_selection.py b/build_config/tests/test_cuda_arch_selection.py new file mode 100644 index 0000000..765d94f --- /dev/null +++ b/build_config/tests/test_cuda_arch_selection.py @@ -0,0 +1,95 @@ +import unittest +from unittest import mock + +from accvlab_build_config.helpers import build_utils + + +class CudaArchSelectionTest(unittest.TestCase): + def _mock_supported_architectures(self, supported_architectures): + return mock.patch.object( + build_utils, + "_detect_nvcc_supported_architectures", + return_value=supported_architectures, + ) + + def test_exact_supported_architecture_uses_real_target(self): + with self._mock_supported_architectures(["80", "90", "100"]): + selection = build_utils.select_cuda_architectures_for_nvcc(["90"]) + + self.assertEqual(selection.architectures, ["90"]) + self.assertEqual(selection.ptx_architectures, []) + + def test_unsupported_hole_uses_base_ptx_not_nearby_real(self): + with self._mock_supported_architectures(["100", "120"]): + selection = build_utils.select_cuda_architectures_for_nvcc(["103"]) + + self.assertEqual(selection.architectures, []) + self.assertEqual(selection.ptx_architectures, ["100"]) + + def test_future_gpu_uses_supported_base_ptx_below_detection(self): + with self._mock_supported_architectures(["80", "90", "100", "103", "120"]): + selection = build_utils.select_cuda_architectures_for_nvcc(["121"]) + + self.assertEqual(selection.architectures, []) + self.assertEqual(selection.ptx_architectures, ["120"]) + + def test_unsupported_detection_uses_greatest_supported_ptx_without_base(self): + with self._mock_supported_architectures(["75", "86", "89"]): + selection = build_utils.select_cuda_architectures_for_nvcc(["88"]) + + self.assertEqual(selection.architectures, []) + self.assertEqual(selection.ptx_architectures, ["86"]) + + def test_unsupported_detection_without_lower_support_remains_unchanged(self): + with self._mock_supported_architectures(["60", "70"]): + selection = build_utils.select_cuda_architectures_for_nvcc(["50"]) + + self.assertEqual(selection.architectures, ["50"]) + self.assertEqual(selection.ptx_architectures, []) + + def test_mixed_exact_and_unsupported_architectures_preserve_order(self): + with self._mock_supported_architectures(["80", "90", "100", "120"]): + selection = build_utils.select_cuda_architectures_for_nvcc(["90", "103"]) + + self.assertEqual(selection.architectures, ["90"]) + self.assertEqual(selection.ptx_architectures, ["100"]) + + def test_duplicate_ptx_fallbacks_are_deduplicated(self): + with self._mock_supported_architectures(["100", "120"]): + selection = build_utils.select_cuda_architectures_for_nvcc(["103", "103"]) + + self.assertEqual(selection.architectures, []) + self.assertEqual(selection.ptx_architectures, ["100"]) + + def test_no_detected_nvcc_architectures_returns_input_unchanged(self): + with self._mock_supported_architectures([]): + selection = build_utils.select_cuda_architectures_for_nvcc(["103"]) + + self.assertEqual(selection.architectures, ["103"]) + self.assertEqual(selection.ptx_architectures, []) + + def test_explicit_custom_architectures_are_not_rewritten(self): + with mock.patch.object( + build_utils, + "select_cuda_architectures_for_nvcc", + side_effect=AssertionError("unexpected selector call"), + ): + config = { + "CPP_STANDARD": "c++17", + "OPTIMIZE_LEVEL": 3, + "USE_FAST_MATH": False, + "DEBUG_BUILD": False, + "ENABLE_PROFILING": False, + "VERBOSE_BUILD": False, + "CUSTOM_CUDA_ARCHS": ["103"], + } + cuda_info = {"cuda_available": True, "gpu_architectures": ["103"]} + + flags = build_utils.get_compile_flags(config, cuda_info) + + self.assertIn("-gencode=arch=compute_103,code=sm_103", flags["nvcc"]) + self.assertNotIn("-gencode=arch=compute_100,code=compute_100", flags["nvcc"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/docker/Dockerfile b/docker/Dockerfile index 6ca5f14..6ae40b1 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -90,13 +90,13 @@ RUN pip install torch==2.11.0 torchvision==0.26.0 torchaudio==2.11.0 --index-url RUN pip install black -RUN pip install sphinx \ - sphinx-rtd-theme \ - sphinx-autodoc-typehints \ - sphinx-autobuild \ - linkify-it-py \ - myst-parser \ - sphinxcontrib-spelling +RUN pip install sphinx==8.1.3 \ + sphinx-rtd-theme==3.1.0 \ + sphinx-autodoc-typehints==3.0.1 \ + sphinx-autobuild==2024.10.3 \ + linkify-it-py==2.1.0 \ + myst-parser==4.0.1 \ + sphinxcontrib-spelling==8.0.2 RUN pip install ninja \ scikit-build diff --git a/docs/guides/INSTALLATION_GUIDE.md b/docs/guides/INSTALLATION_GUIDE.md index f9c261f..df0fe2e 100644 --- a/docs/guides/INSTALLATION_GUIDE.md +++ b/docs/guides/INSTALLATION_GUIDE.md @@ -406,7 +406,7 @@ ENABLE_PROFILING=1 ./scripts/package_manager.sh install | `OPTIMIZE_LEVEL` | int: `0`–`3` | `3` | Compiler optimization level | | `CPP_STANDARD` | string: `c++17` | `c++17` | C++ standard to use | | `VERBOSE_BUILD` | bool: `0`/`1`, `true`/`false`, `yes`/`no`, `on`/`off` | `0` | Show detailed build output | -| `CUSTOM_CUDA_ARCHS` | list: e.g. `"70,75,80"` or `"75;80;86"` | PyTorch auto-detect, then package default | Target CUDA architectures | +| `CUSTOM_CUDA_ARCHS` | list: e.g. `"70,75,80"` or `"75;80;86"` | PyTorch auto-detect, then package default | Explicit CUDA architecture override | | `USE_FAST_MATH` | bool: `0`/`1`, `true`/`false`, `yes`/`no`, `on`/`off` | `1` | Enable fast math optimizations | | `ENABLE_PROFILING` | bool: `0`/`1`, `true`/`false`, `yes`/`no`, `on`/`off` | `0` | Enable profiling support | @@ -418,10 +418,13 @@ ENABLE_PROFILING=1 ./scripts/package_manager.sh install > GPU architectures via CUDA-enabled PyTorch. Missing PyTorch or CPU-only PyTorch is treated as a build > configuration error. > -> Auto-detected architectures are capped to the maximum architecture supported by the -> installed `nvcc`, which avoids selecting a GPU architecture that is newer than the CUDA toolkit used for -> the build. When an architecture is capped, the build also includes one PTX target for the newest supported -> forward-compatible base architecture. +> Auto-detected architectures are emitted as real/cubin targets only when the installed `nvcc` exactly supports +> them. If a detected architecture is unsupported, ACCV-Lab emits a supported PTX target below the detected +> architecture, preferring base architectures whose number is divisible by 10 (for example, `100` for a +> detected `103` architecture). +> +> `CUSTOM_CUDA_ARCHS` is an explicit override. When it is set, ACCV-Lab passes those architectures through +> unchanged instead of applying the auto-detection fallback logic. > > If PyTorch is CUDA-enabled but no architecture can be detected > (for example because no CUDA device is visible), ACCV-Lab does not pass `CMAKE_CUDA_ARCHITECTURES`; diff --git a/docs/requirements.txt b/docs/requirements.txt index f3c78e2..2094f71 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,6 +1,6 @@ -sphinx>=5.0.0 -sphinx-rtd-theme>=1.0.0 -sphinx-autodoc-typehints>=1.19.0 -myst-parser>=0.18.0 -linkify-it-py>=2.0.0 -sphinx-autobuild \ No newline at end of file +sphinx>=8.1.3 +sphinx-rtd-theme>=3.1.0 +sphinx-autodoc-typehints>=3.0.1 +myst-parser>=4.0.1 +linkify-it-py>=2.1.0 +sphinx-autobuild>=2024.10.3 \ No newline at end of file diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index 4600077..bee6c51 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -211,3 +211,4 @@ picklable ABI aggregator multimodal +cubin