Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 37 additions & 32 deletions build_config/accvlab_build_config/helpers/build_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ class CudaArchitectureSelection(NamedTuple):

Attributes:
architectures: CUDA architectures to build as cubin targets.
ptx_architectures: At most one architecture to build as a PTX target
because a detected GPU architecture had to be capped.
ptx_architectures: CUDA architectures to build as PTX targets because
detected GPU architectures were not exact ``nvcc`` cubin targets.
"""

architectures: List[str]
Expand Down Expand Up @@ -91,8 +91,8 @@ def _split_cuda_architectures(value: str) -> List[str]:
return [arch.strip() for arch in re.split(r"[,;]", value) if arch.strip()]


def _forward_compatible_ptx_architecture(
supported_architectures: List[str], max_architecture: int
def _supported_ptx_fallback_architecture(
supported_architectures: List[str], detected_architecture: int
) -> Optional[str]:
forward_compatible_archs: List[str] = []
fallback_archs: List[str] = []
Expand All @@ -102,7 +102,7 @@ def _forward_compatible_ptx_architecture(
except ValueError:
continue

if arch_int > max_architecture:
if arch_int > detected_architecture:
continue

fallback_archs.append(arch)
Expand All @@ -121,47 +121,52 @@ def select_cuda_architectures_for_nvcc(
) -> CudaArchitectureSelection:
"""Select CUDA cubin and PTX targets supported by the installed ``nvcc``.

Numeric architectures above ``nvcc``'s maximum supported architecture are
capped to that maximum. When capping occurs, one PTX target is added using
the newest forward-compatible base architecture supported by ``nvcc`` at or
below the capped architecture. For example, if the highest supported
architecture is ``96``, the PTX target is ``90``.
A detected architecture is emitted as a cubin target only when
``nvcc --list-gpu-arch`` reports that exact architecture. Unsupported
detected architectures use a PTX fallback at or below the detected
architecture, preferring the newest supported base architecture where the
architecture number is divisible by 10.

Args:
cuda_architectures: CUDA architecture numbers to select from, for
example ``["80", "90", "103"]``.

Returns:
CudaArchitectureSelection: The capped cubin architectures and, when
capping occurred, the single architecture to emit as a PTX target. If
``nvcc`` cannot be found or queried, the input architectures are returned
unchanged and no PTX targets are added.
CudaArchitectureSelection: The exact cubin architectures and any PTX
fallback architectures. If ``nvcc`` cannot be found or queried, the
input architectures are returned unchanged and no PTX targets are added.
"""
supported_archs = _detect_nvcc_supported_architectures()
if not cuda_architectures or not supported_archs:
return CudaArchitectureSelection(cuda_architectures, [])

max_supported = max(int(arch) for arch in supported_archs)
capped_archs: List[str] = []
any_arch_capped = False
supported_arch_set = set(supported_archs)
selected_archs: List[str] = []
ptx_archs: List[str] = []

for arch in cuda_architectures:
if arch in supported_arch_set:
if arch not in selected_archs:
selected_archs.append(arch)
continue

try:
arch_int = int(arch)
capped_arch = str(min(arch_int, max_supported))
any_arch_capped = any_arch_capped or arch_int > max_supported
except ValueError:
capped_arch = arch
if arch not in selected_archs:
selected_archs.append(arch)
continue

if capped_arch not in capped_archs:
capped_archs.append(capped_arch)
ptx_arch = _supported_ptx_fallback_architecture(supported_archs, arch_int)
if ptx_arch is None:
if arch not in selected_archs:
selected_archs.append(arch)
continue

ptx_archs: List[str] = []
if any_arch_capped:
ptx_arch = _forward_compatible_ptx_architecture(supported_archs, max_supported)
if ptx_arch is not None:
if ptx_arch not in ptx_archs:
ptx_archs.append(ptx_arch)

return CudaArchitectureSelection(capped_archs, ptx_archs)
return CudaArchitectureSelection(selected_archs, ptx_archs)


def missing_torch_error() -> RuntimeError:
Expand Down Expand Up @@ -294,11 +299,11 @@ def detect_cuda_info():
def get_compile_flags(config, cuda_info, include_dirs=None):
"""Construct compilation flags.

If ``CUSTOM_CUDA_ARCHS`` is unset, detected CUDA architectures are capped to
the maximum supported by ``nvcc``. If any architecture is capped, the newest
forward-compatible base architecture supported by ``nvcc`` is also emitted
as a PTX target. If no architecture can be detected, no explicit CUDA
architecture flags are generated.
If ``CUSTOM_CUDA_ARCHS`` is unset, detected CUDA architectures are emitted
as cubin targets only when ``nvcc`` reports exact support. Unsupported
detections fall back to supported PTX at or below the detected architecture.
If no architecture can be detected, no explicit CUDA architecture flags are
generated.

Args:
config (dict): Build configuration
Expand Down
14 changes: 7 additions & 7 deletions build_config/accvlab_build_config/helpers/cmake_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,10 @@ def _build_cmake_args_from_env() -> List[str]:
Build a list of -D CMake arguments from environment variables to harmonize
build configuration across setuptools, external CMake, and scikit-build flows.

If ``CUSTOM_CUDA_ARCHS`` is unset, detected CUDA architectures are capped to
the maximum supported by ``nvcc``. If capping occurs, CMake builds cubins for
the capped architectures and adds one PTX target for the newest supported
forward-compatible base architecture.
If ``CUSTOM_CUDA_ARCHS`` is unset, detected CUDA architectures become CMake
real targets only when ``nvcc`` reports exact support. Unsupported
detections use supported virtual/PTX targets at or below the detected
architecture.
"""
args: List[str] = []
# Always export compile_commands.json for tooling/validation
Expand Down Expand Up @@ -196,9 +196,9 @@ def build_cmake_args() -> List[str]:
"""
Full CMake -D list: environment-based flags plus repo-aligned SCM version define.

Auto-detected CUDA architectures are capped to ``nvcc`` support when
``CUSTOM_CUDA_ARCHS`` is unset. If capping occurs, one PTX target is emitted
for the newest supported forward-compatible base architecture.
When ``CUSTOM_CUDA_ARCHS`` is unset, auto-detected CUDA architectures are
emitted as real targets only if ``nvcc`` reports exact support; unsupported
detections fall back to supported PTX targets at or below the detected
architecture.
"""
root = get_project_root()
return _build_cmake_args_from_env() + _build_cmake_args_package_scm_version(root)
Expand Down
95 changes: 95 additions & 0 deletions build_config/tests/test_cuda_arch_selection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import unittest
from unittest import mock

from accvlab_build_config.helpers import build_utils


class CudaArchSelectionTest(unittest.TestCase):
    """Tests for CUDA cubin/PTX target selection in ``build_utils``.

    The selection cases patch ``build_utils._detect_nvcc_supported_architectures``
    to return a fixed list and then inspect the ``architectures`` and
    ``ptx_architectures`` fields of the returned selection.
    """

    def _mock_supported_architectures(self, supported_architectures):
        """Return a patcher that makes the nvcc-architecture probe return a fixed list."""
        patcher = mock.patch.object(
            build_utils,
            "_detect_nvcc_supported_architectures",
            return_value=supported_architectures,
        )
        return patcher

    def _check_selection(self, nvcc_archs, detected_archs, expected_cubin, expected_ptx):
        """Run the selector with a patched probe and compare both result lists."""
        with self._mock_supported_architectures(nvcc_archs):
            result = build_utils.select_cuda_architectures_for_nvcc(detected_archs)

        self.assertEqual(result.architectures, expected_cubin)
        self.assertEqual(result.ptx_architectures, expected_ptx)

    def test_exact_supported_architecture_uses_real_target(self):
        """An architecture listed by the probe stays a cubin target, with no PTX."""
        self._check_selection(
            nvcc_archs=["80", "90", "100"],
            detected_archs=["90"],
            expected_cubin=["90"],
            expected_ptx=[],
        )

    def test_unsupported_hole_uses_base_ptx_not_nearby_real(self):
        """``103`` with only ``100``/``120`` supported yields PTX ``100`` and no cubin."""
        self._check_selection(
            nvcc_archs=["100", "120"],
            detected_archs=["103"],
            expected_cubin=[],
            expected_ptx=["100"],
        )

    def test_future_gpu_uses_supported_base_ptx_below_detection(self):
        """``121`` above every supported entry yields PTX ``120``."""
        self._check_selection(
            nvcc_archs=["80", "90", "100", "103", "120"],
            detected_archs=["121"],
            expected_cubin=[],
            expected_ptx=["120"],
        )

    def test_unsupported_detection_uses_greatest_supported_ptx_without_base(self):
        """With no multiple-of-10 entry at or below ``88``, PTX ``86`` is chosen."""
        self._check_selection(
            nvcc_archs=["75", "86", "89"],
            detected_archs=["88"],
            expected_cubin=[],
            expected_ptx=["86"],
        )

    def test_unsupported_detection_without_lower_support_remains_unchanged(self):
        """``50`` below every supported entry is passed through as a cubin target."""
        self._check_selection(
            nvcc_archs=["60", "70"],
            detected_archs=["50"],
            expected_cubin=["50"],
            expected_ptx=[],
        )

    def test_mixed_exact_and_unsupported_architectures_preserve_order(self):
        """An exact match and an unsupported entry land in their respective lists."""
        self._check_selection(
            nvcc_archs=["80", "90", "100", "120"],
            detected_archs=["90", "103"],
            expected_cubin=["90"],
            expected_ptx=["100"],
        )

    def test_duplicate_ptx_fallbacks_are_deduplicated(self):
        """Two identical unsupported detections produce a single PTX entry."""
        self._check_selection(
            nvcc_archs=["100", "120"],
            detected_archs=["103", "103"],
            expected_cubin=[],
            expected_ptx=["100"],
        )

    def test_no_detected_nvcc_architectures_returns_input_unchanged(self):
        """An empty probe result returns the input list untouched, with no PTX."""
        self._check_selection(
            nvcc_archs=[],
            detected_archs=["103"],
            expected_cubin=["103"],
            expected_ptx=[],
        )

    def test_explicit_custom_architectures_are_not_rewritten(self):
        """``CUSTOM_CUDA_ARCHS`` reaches the nvcc flags without calling the selector."""
        build_config = {
            "CPP_STANDARD": "c++17",
            "OPTIMIZE_LEVEL": 3,
            "USE_FAST_MATH": False,
            "DEBUG_BUILD": False,
            "ENABLE_PROFILING": False,
            "VERBOSE_BUILD": False,
            "CUSTOM_CUDA_ARCHS": ["103"],
        }
        detected_info = {"cuda_available": True, "gpu_architectures": ["103"]}

        # Any call to the selector raises, so the test fails if the override
        # path consults it.
        forbid_selector = mock.patch.object(
            build_utils,
            "select_cuda_architectures_for_nvcc",
            side_effect=AssertionError("unexpected selector call"),
        )
        with forbid_selector:
            flags = build_utils.get_compile_flags(build_config, detected_info)

        nvcc_flags = flags["nvcc"]
        self.assertIn("-gencode=arch=compute_103,code=sm_103", nvcc_flags)
        self.assertNotIn("-gencode=arch=compute_100,code=compute_100", nvcc_flags)


# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
14 changes: 7 additions & 7 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,13 @@ RUN pip install torch==2.11.0 torchvision==0.26.0 torchaudio==2.11.0 --index-url

RUN pip install black

RUN pip install sphinx \
sphinx-rtd-theme \
sphinx-autodoc-typehints \
sphinx-autobuild \
linkify-it-py \
myst-parser \
sphinxcontrib-spelling
RUN pip install sphinx==8.1.3 \
sphinx-rtd-theme==3.1.0 \
sphinx-autodoc-typehints==3.0.1 \
sphinx-autobuild==2024.10.3 \
linkify-it-py==2.1.0 \
myst-parser==4.0.1 \
sphinxcontrib-spelling==8.0.2

RUN pip install ninja \
scikit-build
Expand Down
13 changes: 8 additions & 5 deletions docs/guides/INSTALLATION_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,7 @@ ENABLE_PROFILING=1 ./scripts/package_manager.sh install
| `OPTIMIZE_LEVEL` | int: `0`–`3` | `3` | Compiler optimization level |
| `CPP_STANDARD` | string: `c++17` | `c++17` | C++ standard to use |
| `VERBOSE_BUILD` | bool: `0`/`1`, `true`/`false`, `yes`/`no`, `on`/`off` | `0` | Show detailed build output |
| `CUSTOM_CUDA_ARCHS` | list: e.g. `"70,75,80"` or `"75;80;86"` | PyTorch auto-detect, then package default | Target CUDA architectures |
| `CUSTOM_CUDA_ARCHS` | list: e.g. `"70,75,80"` or `"75;80;86"` | PyTorch auto-detect, then package default | Explicit CUDA architecture override |
| `USE_FAST_MATH` | bool: `0`/`1`, `true`/`false`, `yes`/`no`, `on`/`off` | `1` | Enable fast math optimizations |
| `ENABLE_PROFILING` | bool: `0`/`1`, `true`/`false`, `yes`/`no`, `on`/`off` | `0` | Enable profiling support |

Expand All @@ -418,10 +418,13 @@ ENABLE_PROFILING=1 ./scripts/package_manager.sh install
> GPU architectures via CUDA-enabled PyTorch. Missing PyTorch or CPU-only PyTorch is treated as a build
> configuration error.
>
> Auto-detected architectures are capped to the maximum architecture supported by the
> installed `nvcc`, which avoids selecting a GPU architecture that is newer than the CUDA toolkit used for
> the build. When an architecture is capped, the build also includes one PTX target for the newest supported
> forward-compatible base architecture.
> Auto-detected architectures are emitted as real/cubin targets only when the installed `nvcc` exactly supports
> them. If a detected architecture is unsupported, ACCV-Lab emits a supported PTX target below the detected
> architecture, preferring base architectures whose number is divisible by 10 (for example, `100` for a
> detected `103` architecture).
>
> `CUSTOM_CUDA_ARCHS` is an explicit override. When it is set, ACCV-Lab passes those architectures through
> unchanged instead of applying the auto-detection fallback logic.
>
> If PyTorch is CUDA-enabled but no architecture can be detected
> (for example because no CUDA device is visible), ACCV-Lab does not pass `CMAKE_CUDA_ARCHITECTURES`;
Expand Down
12 changes: 6 additions & 6 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
sphinx>=5.0.0
sphinx-rtd-theme>=1.0.0
sphinx-autodoc-typehints>=1.19.0
myst-parser>=0.18.0
linkify-it-py>=2.0.0
sphinx-autobuild
sphinx>=8.1.3
sphinx-rtd-theme>=3.1.0
sphinx-autodoc-typehints>=3.0.1
myst-parser>=4.0.1
linkify-it-py>=2.1.0
sphinx-autobuild>=2024.10.3
1 change: 1 addition & 0 deletions docs/spelling_wordlist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -211,3 +211,4 @@ picklable
ABI
aggregator
multimodal
cubin