diff --git a/.ci/aarch64_linux/aarch64_ci_build.sh b/.ci/aarch64_linux/aarch64_ci_build.sh index 178db42a609a..a0eb0b72df2b 100644 --- a/.ci/aarch64_linux/aarch64_ci_build.sh +++ b/.ci/aarch64_linux/aarch64_ci_build.sh @@ -31,8 +31,7 @@ pip install -r /pytorch/requirements.txt pip install auditwheel==6.2.0 wheel if [ "$DESIRED_CUDA" = "cpu" ]; then echo "BASE_CUDA_VERSION is not set. Building cpu wheel." - #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files - USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn + python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn else echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA" export USE_SYSTEM_NCCL=1 @@ -46,6 +45,5 @@ else export USE_NVIDIA_PYPI_LIBS=1 fi - #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files - USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda + python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda fi diff --git a/.ci/aarch64_linux/aarch64_wheel_ci_build.py b/.ci/aarch64_linux/aarch64_wheel_ci_build.py index 1b6429fa8c06..d4afea81ac0b 100755 --- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py +++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py @@ -317,7 +317,7 @@ def parse_arguments(): ).decode() print("Building PyTorch wheel") - build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " + build_vars = "" # MAX_JOB=5 is not required for CPU backend (see commit 465d98b) if enable_cuda: build_vars += "MAX_JOBS=5 " diff --git a/.ci/docker/ci_commit_pins/executorch.txt b/.ci/docker/ci_commit_pins/executorch.txt index 0e527f468229..0a30a6037a05 100644 --- a/.ci/docker/ci_commit_pins/executorch.txt +++ b/.ci/docker/ci_commit_pins/executorch.txt @@ -1 +1 @@ -56392aa978594cc155fa8af48cd949f5b5f1823a +e0dda9059d082537cee36be6c5e4fe3b18c880c0 diff --git a/.ci/docker/common/install_executorch.sh b/.ci/docker/common/install_executorch.sh index becd2264e395..fb168acd4feb 100755 --- a/.ci/docker/common/install_executorch.sh +++ b/.ci/docker/common/install_executorch.sh @@ -42,22 +42,27 @@ install_pip_dependencies() { # A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current # numba and scipy version used in PyTorch CI conda_run pip uninstall -y numba scipy + # Yaspin is needed for running CI test (get_benchmark_analysis_data.py) + pip_install yaspin==3.1.0 popd } setup_executorch() { - pushd executorch - export PYTHON_EXECUTABLE=python - export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" + export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON" as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true - popd } -clone_executorch -install_buck2 -install_conda_dependencies -install_pip_dependencies -setup_executorch +if [ $# -eq 0 ]; then + clone_executorch + install_buck2 + install_conda_dependencies + install_pip_dependencies + pushd executorch + setup_executorch + popd +else + "$@" +fi diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index 7290ff6c8954..ac8f7cde5e24 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -1550,14 +1550,10 @@ test_executorch() { install_torchvision install_torchaudio - pushd 
/executorch - - export PYTHON_EXECUTABLE=python - export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" + INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh" - # NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch - # from the PR - bash .ci/scripts/setup-linux.sh --build-tool cmake + pushd /executorch + "${INSTALL_SCRIPT}" setup_executorch echo "Run ExecuTorch unit tests" pytest -v -n auto @@ -1571,10 +1567,6 @@ test_executorch() { popd - # Test torchgen generated code for Executorch. - echo "Testing ExecuTorch op registration" - "$BUILD_BIN_DIR"/test_edge_op_registration - assert_git_not_dirty } diff --git a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat index 01e08c8bb4e5..8720e3981f0a 100644 --- a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat +++ b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat @@ -3,12 +3,12 @@ if "%BUILD_ENVIRONMENT%"=="" ( ) else ( set CONDA_PARENT_DIR=C:\Jenkins ) - +set CONDA_ROOT_DIR=%CONDA_PARENT_DIR%\Miniconda3 :: Be conservative here when rolling out the new AMI with conda. This will try :: to install conda as before if it couldn't find the conda installation. This :: can be removed eventually after we gain enough confidence in the AMI -if not exist %CONDA_PARENT_DIR%\Miniconda3 ( +if not exist %CONDA_ROOT_DIR% ( set INSTALL_FRESH_CONDA=1 ) @@ -17,10 +17,14 @@ if "%INSTALL_FRESH_CONDA%"=="1" ( if errorlevel 1 exit /b if not errorlevel 0 exit /b - %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3 + %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_ROOT_DIR% if errorlevel 1 exit /b if not errorlevel 0 exit /b ) :: Activate conda so that we can use its commands, i.e. conda, python, pip -call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3 +call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR% +:: Activate conda so that we can use its commands, i.e. conda, python, pip +call conda activate py_tmp + +call pip install -r requirements.txt diff --git a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat index 4a464d6b5786..3173582b06f4 100644 --- a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat +++ b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat @@ -14,7 +14,7 @@ if not errorlevel 0 exit /b :: build\torch. Rather than changing all these references, making a copy of torch folder :: from conda to the current workspace is easier. The workspace will be cleaned up after :: the job anyway -xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\ +xcopy /s %CONDA_ROOT_DIR%\envs\py_tmp\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\ pushd . 
if "%VC_VERSION%" == "" ( diff --git a/.ci/pytorch/win-test.sh b/.ci/pytorch/win-test.sh index 43524dc04e3f..2f553084dc55 100755 --- a/.ci/pytorch/win-test.sh +++ b/.ci/pytorch/win-test.sh @@ -38,7 +38,13 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then fi # TODO: Move both of them to Windows AMI -python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1 +python -m pip install tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1 + +# Copied from https://github.com/pytorch/test-infra/blob/be01a40157c36cd5a48391fdf44a7bc3ebd4c7e3/aws/ami/windows/scripts/Installers/Install-Pip-Dependencies.ps1#L16 with some adjustments +# pytest-rerunfailures==10.3 as 10.2 fails with INTERNALERROR> pluggy._manager.PluginValidationError: unknown hook 'pytest_configure_node' +# scipy from 1.6.3 to 1.10 +# expecttest from 0.1.3 to 0.3.0 +python -m pip install "future==0.18.2" "hypothesis==5.35.1" "expecttest==0.3.0" "librosa>=0.6.2" "scipy==1.10.1" "psutil==5.9.1" "pynvml==11.4.1" "pillow==9.2.0" "unittest-xml-reporting<=3.2.0,>=2.0.0" "pytest==7.1.3" "pytest-xdist==2.5.0" "pytest-flakefinder==1.1.0" "pytest-rerunfailures==10.3" "pytest-shard==0.1.2" "sympy==1.11.1" "xdoctest==1.0.2" "pygments==2.12.0" "opt-einsum>=3.3" "networkx==2.8.8" "mpmath==1.2.1" "pytest-cpp==2.3.0" # Install Z3 optional dependency for Windows builds. python -m pip install z3-solver==4.15.1.0 @@ -52,9 +58,6 @@ python -m pip install parameterized==0.8.1 # Install pulp for testing ilps under torch\distributed\_tools python -m pip install pulp==2.9.0 -# Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308 -python -m pip install expecttest==0.3.0 - run_tests() { # Run nvidia-smi if available for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do diff --git a/.github/actions/reuse-old-whl/reuse_old_whl.py b/.github/actions/reuse-old-whl/reuse_old_whl.py index def0276a9c8a..48a849098594 100644 --- a/.github/actions/reuse-old-whl/reuse_old_whl.py +++ b/.github/actions/reuse-old-whl/reuse_old_whl.py @@ -264,7 +264,7 @@ def change_content_to_new_version(file: Union[str, Path]) -> None: change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py") for file in Path(f"artifacts/dist/{old_stem}").glob( - "*.dist-info/**", + "*.dist-info/*", ): change_content_to_new_version(file) diff --git a/.github/actions/setup-win/action.yml b/.github/actions/setup-win/action.yml index 93c957896b5e..37cec0c57153 100644 --- a/.github/actions/setup-win/action.yml +++ b/.github/actions/setup-win/action.yml @@ -6,6 +6,12 @@ inputs: cuda-version: description: which cuda version to install, 'cpu' for none required: true + python-version: + required: false + type: string + default: "3.10" + description: | + The python version to be used. Will be 3.10 by default runs: using: composite @@ -38,18 +44,24 @@ runs: CONDA="C:\Jenkins\Miniconda3\condabin\conda.bat" { + echo "CONDA=${CONDA}"; echo "CONDA_RUN=${CONDA} run --no-capture-output"; echo "CONDA_BUILD=${CONDA} run conda-build"; echo "CONDA_INSTALL=${CONDA} install"; } >> "${GITHUB_ENV}" - name: Setup Python3 + env: + PYTHON_VERSION: ${{ inputs.python-version }} shell: bash run: | set +e set -x - PYTHON3=$(${CONDA_RUN} which python3) + # Create new py_tmp env with python-version + ${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp + + PYTHON3=$(${CONDA_RUN} -n py_tmp which python3) EXIT_CODE=$? 
if [[ "${EXIT_CODE}" == "0" ]]; then @@ -62,7 +74,7 @@ runs: # installation, which is Python 3 based. Its Python is default to Python 3. Further, there # is also the Miniconda installation that is Python 2 based, and both can be installed if # needed. In both cases, Python binary is just called python - PYTHON=$(${CONDA_RUN} which python) + PYTHON=$(${CONDA_RUN} -n py_tmp which python) EXIT_CODE=$? if [[ "${EXIT_CODE}" == "0" ]]; then diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml index 7067d79eb075..d447dba4a511 100644 --- a/.github/workflows/_win-build.yml +++ b/.github/workflows/_win-build.yml @@ -151,7 +151,7 @@ jobs: BUILD_WHEEL: 1 MAX_JOBS: 8 CUDA_VERSION: ${{ inputs.cuda-version }} - PYTHON_VERSION: "3.9" + PYTHON_VERSION: "3.10" SCCACHE_BUCKET: "ossci-compiler-cache" SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} SCCACHE_REGION: us-east-1 diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml index 5049ef61f693..d48d65d9270d 100644 --- a/.github/workflows/_win-test.yml +++ b/.github/workflows/_win-test.yml @@ -184,7 +184,7 @@ jobs: env: USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }} INSTALL_WINDOWS_SDK: 1 - PYTHON_VERSION: 3.9 + PYTHON_VERSION: "3.10" CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }} @@ -217,6 +217,7 @@ jobs: PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }} run: | + which python3 pushd "${PYTORCH_FINAL_PACKAGE_DIR}" # shellcheck disable=SC2046,SC2102 python3 -mpip install $(echo *.whl)[opt-einsum,optree] optree==0.13.0 diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 492f41775d9d..e44484e0ddd2 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -71,8 +71,7 @@ jobs: pytorch-linux-jammy-py3-clang12-onnx, pytorch-linux-jammy-linter, pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter, - # Executorch pin needs update - # pytorch-linux-jammy-py3-clang12-executorch, + pytorch-linux-jammy-py3-clang12-executorch, pytorch-linux-jammy-py3.12-triton-cpu, pytorch-linux-noble-riscv64-py3.12-gcc14 ] diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index ff6e9ed10711..290a794ba545 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -318,32 +318,6 @@ jobs: ]} secrets: inherit - linux-jammy-py3-clang12-executorch-build: - if: false # Docker build needs pin update - name: linux-jammy-py3-clang12-executorch - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3-clang12-executorch - docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch - test-matrix: | - { include: [ - { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, - ]} - secrets: inherit - - linux-jammy-py3-clang12-executorch-test: - name: linux-jammy-py3-clang12-executorch - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-py3-clang12-executorch-build - if: false # Has been broken for a while - with: - build-environment: linux-jammy-py3-clang12-executorch - docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }} - 
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }} - secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: name: cuda12.8-py3.10-gcc9-sm75 uses: ./.github/workflows/_linux-build.yml diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 5b1a12812003..0140c2d3c00c 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -259,3 +259,27 @@ jobs: docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }} test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }} secrets: inherit + + linux-jammy-py3-clang12-executorch-build: + name: linux-jammy-py3-clang12-executorch + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-py3-clang12-executorch + docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch + test-matrix: | + { include: [ + { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, + ]} + secrets: inherit + + linux-jammy-py3-clang12-executorch-test: + name: linux-jammy-py3-clang12-executorch + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-py3-clang12-executorch-build + with: + build-environment: linux-jammy-py3-clang12-executorch + docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }} + secrets: inherit diff --git a/.gitignore b/.gitignore index 2dd40f8cfa85..ca87f1306e12 100644 --- a/.gitignore +++ b/.gitignore @@ -259,6 +259,9 @@ gen .pytest_cache aten/build/* +# Linker scripts for prioritized text optimization +cmake/linker_script.ld + # Bram plsdontbreak diff --git a/.lintrunner.toml b/.lintrunner.toml index 1f79f1eb971d..4511520c5462 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -964,7 +964,6 @@ exclude_patterns = [ 'test/jit/**', # should be run through test/test_jit.py 'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py 'test/fx/**', # should be run through test/test_fx.py - 'test/bottleneck_test/**', # excluded by test/run_test.py 'test/package/**', # excluded by test/run_test.py 'test/distributed/argparse_util_test.py', 'test/distributed/bin/test_script.py', @@ -1410,8 +1409,6 @@ exclude_patterns = [ 'torch/utils/benchmark/utils/timer.py', 'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py', 'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py', - 'torch/utils/bottleneck/__init__.py', - 'torch/utils/bottleneck/__main__.py', 'torch/utils/bundled_inputs.py', 'torch/utils/checkpoint.py', 'torch/utils/collect_env.py', diff --git a/CMakeLists.txt b/CMakeLists.txt index 4fba0eea881b..8323f310fec4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -380,6 +380,13 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler" OFF "USE_CUDA" OFF) cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON "CPU_AARCH64" OFF) +# prioritized text linker, ON by default for AArch64+Linux, option visible to all AArch64, x86 and ppc64le. +set(USE_PRIORITIZED_TEXT_DEFAULT OFF) +if(LINUX AND CPU_AARCH64) + set(USE_PRIORITIZED_TEXT_DEFAULT ON) +endif() +cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld." 
+ "${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF) option(USE_MIMALLOC "Use mimalloc" OFF) # Enable third party mimalloc library to improve memory allocation performance @@ -657,6 +664,11 @@ endif(MSVC) string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all") +# Set linker max-page-size to 64KiB on AArch64 Linux +if(LINUX AND CPU_AARCH64) + add_link_options_if_supported("-z,max-page-size=0x10000") +endif() + # Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not # applicable to mobile are disabled by this variable. Setting # `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it @@ -1421,3 +1433,57 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA) install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas" DESTINATION "${CMAKE_INSTALL_BINDIR}") endif() + +if(USE_PRIORITIZED_TEXT_FOR_LD) + add_compile_options( + $<$:-ffunction-sections> + $<$:-fdata-sections> + ) + set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld") + set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt") + + add_custom_command( + OUTPUT "${LINKER_SCRIPT_FILE_OUT}" + COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}" + DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}" + COMMENT "Generating prioritized text linker files" + VERBATIM + ) + + add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}") + + if(BUILD_PYTHON) + set(LINKER_OPT_TARGETS torch_python) + endif() + + if(NOT BUILD_LIBTORCHLESS) + list(APPEND LINKER_OPT_TARGETS torch_cpu c10) + if(USE_CUDA) + list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda) + endif() + if(USE_XPU) + list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu) + endif() + if(USE_ROCM) + list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip) + endif() + endif() + + foreach(tgt IN LISTS LINKER_OPT_TARGETS) + if(TARGET ${tgt}) + add_dependencies("${tgt}" generate_linker_script) + target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}") + set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}") + else() + message(WARNING "Requested target '${tgt}' for linker script optimization was not found.") + endif() + endforeach() + +else() + if(LINUX AND CPU_AARCH64) + message(WARNING [[ + It is strongly recommend to enable linker script optimization for all AArch64 Linux builds. 
+ To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1 + ]]) + endif() +endif() \ No newline at end of file diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 4d48084b0ab8..7a8d02be530e 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -180,7 +180,7 @@ void Context::setUserEnabledNNPACK(bool e) { } bool Context::allowTF32CuDNN(const std::string& op) const { - if (op.size() == 0){ + if (op.empty()){ bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32"; bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32"; TORCH_CHECK( @@ -281,9 +281,6 @@ bool Context::userEnabledOverrideableSDP() const { static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG"; static constexpr const std::array cublas_deterministic_configs = {":4096:8", ":16:8"}; -#ifdef USE_ROCM -static constexpr const auto hipblaslt_allow_tf32 = "HIPBLASLT_ALLOW_TF32"; -#endif bool Context::checkCuBLASConfigDeterministic() { // If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config @@ -343,12 +340,6 @@ void Context::setImmediateMiopen(bool b) { } bool Context::allowTF32CuBLAS() const { -#ifdef USE_ROCM - const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32); - if (allow_tf32 != true) { - return false; - } -#endif bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST; bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32"; TORCH_CHECK( @@ -362,14 +353,6 @@ bool Context::allowTF32CuBLAS() const { } void Context::setAllowTF32CuBLAS(bool b) { -#ifdef USE_ROCM - const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32); - if (allow_tf32 != true) { - C10_LOG_FIRST_N(INFO, 10) << "torch.backends.cuda.matmul.allow_tf32 is not supported on ROCm by default. " - << "Please set environment variable HIPBLASLT_ALLOW_TF32=1 to enable it."; - return; - } -#endif float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST; setFloat32Precision("cuda", "matmul", b ? "tf32" : "ieee"); } @@ -443,7 +426,7 @@ void Context::setFloat32Precision(const std::string& backend, const std::string& std::string msg; auto iterp = _fp32_precisions.find(backend); TORCH_CHECK(iterp != _fp32_precisions.end()); - for (auto p : iterp->second) { + for (const auto& p : iterp->second) { msg += p; msg += " "; } diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp index 7d5e4e84e861..e771c3352767 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.cpp +++ b/aten/src/ATen/FunctionalTensorWrapper.cpp @@ -485,7 +485,10 @@ void FunctionalTensorWrapper::shallow_copy_from(const c10::intrusive_ptrdevice(); + // The storage pointer already uses the underlying tensor custom device (if + // applicable) to extract the device. So, we dont have to recurse again by + // doing value_.unsafeGetTensorImpl()->device(). 
+ return storage().data_ptr().device(); } at::IntArrayRef FunctionalTensorWrapper::sizes_custom() const { return value_.unsafeGetTensorImpl()->sizes(); diff --git a/aten/src/ATen/native/cuda/Shape.cu b/aten/src/ATen/native/cuda/Shape.cu index e2eb2226acf4..c1a4f47ccd98 100644 --- a/aten/src/ATen/native/cuda/Shape.cu +++ b/aten/src/ATen/native/cuda/Shape.cu @@ -226,6 +226,38 @@ __global__ void CatArrayBatchedCopy_contig( } } + +template +__global__ void CatArrayBatchedCopy_vectorized( + char* output, + CatArrInputTensorMetadata inputs, + TensorSizeStride os, + const int concatDim, + IndexType trailingSize) { + + IndexType tid = blockIdx.x * blockDim.x + threadIdx.x; + IndexType nElements = inputs.nElements[blockIdx.y] / elems_per_vec; + + if(tid >= nElements) return; + + const char * data = (char*)inputs.input[blockIdx.y]; + IndexType offset = inputs.offset[blockIdx.y] * trailingSize / elems_per_vec; + IndexType dimSize = inputs.dimSize[blockIdx.y] * trailingSize / elems_per_vec; + int64_t dataOffset = (int64_t)offset * alignment; // in bytes + + IndexType stride = gridDim.x * blockDim.x; + + while( tid < nElements){ + int64_t elementOffset = (int64_t)CatArrIndexToOffset::compute( + os.tensorSize, os.tensorStride, dimSize, concatDim, tid) * alignment; // in bytes + auto vec = at::native::memory::ld_vec(data + (int64_t)alignment * tid); + at::native::memory::st_vec(output + dataOffset + elementOffset, vec); + tid += stride; + } +} + + + /* Specialized implementation of the CatArrayBatchedCopy written to generate wide memory loads to improve memory bandwidth throughput. @@ -296,12 +328,27 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i scalar_t *data = (scalar_t *)(out.mutable_data_ptr()); CatArrInputTensorMetadata catMetaData; TensorSizeStride outputParam; + // If all batches are contiguous we can call a specialized implementation + // which requires the input tensor addresses to be aligned to a + // 16 Byte boundary. + + constexpr bool isContig = stride_size == 1; + bool isAligned = true; + constexpr int alignment = 16; // Next, let's initialize the size, stride arrays for the output Tensor. + // for contig case, we'll canonicalize output strides, so that + // we don't have arbitrary strides for dims of size 0 + size_t stride0 = 1; if (memory_format == c10::MemoryFormat::Contiguous) { - for (int i = 0; i < nDims; ++i) { + for (int i = nDims - 1; i >= 0; --i) { outputParam.tensorSize[i] = out.size(i); - outputParam.tensorStride[i] = out.stride(i); + if (isContig) { + outputParam.tensorStride[i] = stride0; + stride0 *= out.size(i); + } else { + outputParam.tensorStride[i] = out.stride(i); + } } } else if (memory_format == c10::MemoryFormat::ChannelsLast || memory_format == c10::MemoryFormat::ChannelsLast3d) { // permute the semantics of dims from NCHW to NHWC so that the input @@ -320,12 +367,15 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream(); - // If all batches are contiguous we can call a specialized implementation - // which requires the input tensor addresses to be aligned to a - // 16 Byte boundary. 
- bool isContig = true; - bool isAligned = true; + // for channels last computing slice size correctly is much more involved, so we never send it + // on the fully vectorized path + // we need output stride in cat dimension to be multiple of alignment, + // if we ever use it to compute offsets + // for catting in 0th dimension it doesn't matter + bool isInOutAligned = isContig && at::native::memory::get_alignment(data) >= alignment && + memory_format == c10::MemoryFormat::Contiguous && (dimension == 0 || + outputParam.tensorStride[dimension - 1] * sizeof(scalar_t) % alignment == 0); unsigned int max_elements_per_tensor = 0; // Now we loop @@ -341,6 +391,16 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i // high-dimensional tensor if (inputs[i+batchCounter].get().numel() > 0) { dimSize = inputs[i+batchCounter].get().size(dimension); + if (isInOutAligned) { + auto t = inputs[i+batchCounter].get(); + // similarly to output stride, we cannot trust stride value to + // determine slice size if the corresponding dimension is 1 + // we have to multiply all the subsequent sizes + int64_t slice_size = dimension == 0 ? t.numel() : t.sizes()[dimension - 1] != 1 ? + t.strides()[dimension - 1] : c10::multiply_integers(t.sizes().begin() + dimension, t.sizes().end()); + slice_size *= sizeof(scalar_t); + isInOutAligned &= (slice_size % alignment == 0); + } } catMetaData.input[batchCounter] = (scalar_t*)(inputs[i+batchCounter].get().const_data_ptr()); @@ -351,10 +411,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i #ifdef USE_ROCM // On ROCm, CatArrayBatchedCopy_contig is faster isAligned = false; + isInOutAligned = false; #else // If at least one of the inputs is not aligned, we can't call the // CatArrayBatchedCopy_alignedK_contig isAligned &= is_aligned_vec4(catMetaData.input[batchCounter]); + isInOutAligned &= at::native::memory::get_alignment(catMetaData.input[batchCounter]) >= alignment; #endif if (stride_size > 1) { @@ -365,7 +427,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i catMetaData.tensorStride[batchCounter].tensorStride[j] = strides[j]; } catMetaData.isContiguous[batchCounter] = false; - isContig = false; } else { catMetaData.isContiguous[batchCounter] = true; } @@ -388,10 +449,13 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i max_elements_per_tensor, batchCounter); #else dim3 applyBlock, catGrid; - if (isContig && sizeof(scalar_t) > 2) { + if (isInOutAligned) { + std::tie(catGrid, applyBlock) = getCatGridContig( + max_elements_per_tensor, batchCounter); + } else if (isContig && isAligned && sizeof(scalar_t) > 2) { std::tie(catGrid, applyBlock) = getCatGridContig( max_elements_per_tensor, batchCounter); - } else if (isContig && sizeof(scalar_t) == 2) { + } else if (isContig && isAligned && sizeof(scalar_t) == 2) { std::tie(catGrid, applyBlock) = getCatGridContig( max_elements_per_tensor, batchCounter); } else { @@ -399,6 +463,30 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i getCatGrid(batchCounter, catGrid); } #endif + int32_t trailingSize; + TensorSizeStride kernelOutputParam; + if (isInOutAligned) { + // in this case we can and should flatten the tensors after the cat dim + // we want to view the tensors as if consisting of `alignment`-sized elements + // however, we might not be able to cleanly divide just the last dim - + // it might not be the multiple of alignment. 
+ // however, we know that the full concatted slice is multiple of alignment, + // so if we flatten all the dims after and including concat dim, + // it will be divisible by alignment + // then we need to divide last out size by elems_per_vec, + // and divide all strides except last by elems_per_vec (last stride is 1 always) + // for input, we will fix up the sizes and strides in the kernel directly + kernelOutputParam = outputParam; + nDims = dimension + 1; + constexpr auto elems_per_vec = alignment / sizeof(scalar_t); + auto out_size = dimension == 0 ? out.numel() : kernelOutputParam.tensorStride[dimension-1]; + kernelOutputParam.tensorSize[dimension] = out_size / elems_per_vec; + trailingSize = outputParam.tensorStride[dimension]; + kernelOutputParam.tensorStride[dimension] = 1; + for (int i = 0; i < dimension; ++i) { + kernelOutputParam.tensorStride[i] /= elems_per_vec; + } + } if (memory_format != c10::MemoryFormat::Contiguous) { switch (dimension) { @@ -413,7 +501,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i } // Template Declarations for dim = 1, 2, 3, 4 #define HANDLE_CASE(DIMS) \ - if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\ + if (isInOutAligned) {\ + constexpr auto elems_per_vec = alignment / sizeof(scalar_t); \ + CatArrayBatchedCopy_vectorized<<<\ + catGrid, applyBlock, 0, stream.stream()>>>(\ + (char*)data, catMetaData, kernelOutputParam, dimension, trailingSize);\ + } else if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\ CatArrayBatchedCopy_alignedK_contig<<<\ catGrid, applyBlock, 0, stream.stream()>>>(\ data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\ diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv index bf70642a855e..9199f0cf6c37 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv @@ -205,7 +205,7 @@ llama,pass,0 -llama_v2_7b_16h,model_fail_to_load,0 +llama_v2_7b_16h,pass_due_to_skip,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_inductor_torchbench_inference.csv index cc4ef192ca53..1e1646d1caf5 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_inductor_torchbench_inference.csv @@ -178,7 +178,7 @@ llama,fail_to_run,0 -llama_v2_7b_16h,model_fail_to_load,0 +llama_v2_7b_16h,pass_due_to_skip,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv index fed8ebded682..a394375dbfa3 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv @@ -198,7 +198,7 @@ llama,pass,0 -llama_v2_7b_16h,model_fail_to_load,0 +llama_v2_7b_16h,pass_due_to_skip,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_training.csv index 08061de428d7..2f09e06da5e3 100644 --- 
a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_training.csv @@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5 YituTechConvBert,pass,5 + + + +meta-llama/Llama-3.2-1B,eager_failed_to_run,0 + + + +google/gemma-2-2b,eager_failed_to_run,0 + + + +google/gemma-3-4b-it,eager_failed_to_run,0 + + + +openai/whisper-tiny,eager_failed_to_run,0 + + + +Qwen/Qwen3-0.6B,eager_failed_to_run,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv index 014e23e41cb3..7377ab719b4a 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv @@ -198,7 +198,7 @@ llama,pass,0 -llama_v2_7b_16h,model_fail_to_load,0 +llama_v2_7b_16h,pass_due_to_skip,0 diff --git a/benchmarks/dynamo/pr_time_benchmarks/expected_results.csv b/benchmarks/dynamo/pr_time_benchmarks/expected_results.csv index fc11be9ba652..2ebde03ffea4 100644 --- a/benchmarks/dynamo/pr_time_benchmarks/expected_results.csv +++ b/benchmarks/dynamo/pr_time_benchmarks/expected_results.csv @@ -6,7 +6,7 @@ add_loop_eager_dynamic,compile_time_instruction_count,4432000000,0.1 -add_loop_inductor,compile_time_instruction_count,30280000000,0.1 +add_loop_inductor,compile_time_instruction_count,29660000000,0.1 @@ -50,27 +50,27 @@ symint_sum_loop,compile_time_instruction_count,4299000000,0.1 -aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,2151000000,0.1 +aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,1869000000,0.1 -aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,6124000000,0.1 +aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,5281000000,0.1 -aotdispatcher_partitioner_cpu,compile_time_instruction_count,9005000000,0.1 +aotdispatcher_partitioner_cpu,compile_time_instruction_count,8333000000,0.1 -aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1989000000,0.1 +aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1909000000,0.1 -aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3959000000,0.1 +aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3442000000,0.1 -aotdispatcher_training_subclass_cpu,compile_time_instruction_count,10650000000,0.1 +aotdispatcher_training_subclass_cpu,compile_time_instruction_count,9239000000,0.1 @@ -78,7 +78,7 @@ mm_loop_inductor_gpu,compile_time_instruction_count,4820968837,0.1 -mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,8802129167,0.1 +mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,9051000000,0.1 @@ -86,4 +86,4 @@ basic_NestedModule_eager,compile_time_instruction_count,9554000000,0.1 -basic_InlineMod_eager,compile_time_instruction_count,7464000000,0.1 +basic_InlineMod_eager,compile_time_instruction_count,7618000000,0.1 diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index fb64e99bccf2..a0bfb22bed80 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -158,6 +158,7 @@ function(caffe2_print_configuration_summary) if(${USE_KLEIDIAI}) message(STATUS " USE_KLEIDIAI : ${USE_KLEIDIAI}") endif() + message(STATUS " USE_PRIORITIZED_TEXT_FOR_LD : ${USE_PRIORITIZED_TEXT_FOR_LD}") message(STATUS " USE_UCC : ${USE_UCC}") if(${USE_UCC}) message(STATUS " USE_SYSTEM_UCC : ${USE_SYSTEM_UCC}") diff --git a/cmake/public/utils.cmake 
b/cmake/public/utils.cmake index 68e66bb3fc38..c96ffebf858e 100644 --- a/cmake/public/utils.cmake +++ b/cmake/public/utils.cmake @@ -482,6 +482,7 @@ function(torch_update_find_cuda_flags) endfunction() include(CheckCXXCompilerFlag) +include(CheckLinkerFlag) ############################################################################## # CHeck if given flag is supported and append it to provided outputvar @@ -511,3 +512,22 @@ function(target_compile_options_if_supported target flag) target_compile_options(${target} PRIVATE ${flag}) endif() endfunction() + +# Check if a global link option is supported +function(add_link_options_if_supported flag) + check_linker_flag(C "LINKER:${flag}" _supported) + if("${_supported}") + add_link_options("LINKER:${flag}") + else() + message(WARNING "Attempted to use unsupported link option : ${flag}.") + endif() +endfunction() + +function(target_link_options_if_supported tgt flag) + check_linker_flag(C "LINKER:${flag}" _supported) + if("${_supported}") + target_link_options("${tgt}" PRIVATE "LINKER:${flag}") + else() + message(WARNING "Attempted to use unsupported link option : ${flag}.") + endif() +endfunction() \ No newline at end of file diff --git a/docs/source/bottleneck.rst b/docs/source/bottleneck.rst deleted file mode 100644 index ed5caf3fff58..000000000000 --- a/docs/source/bottleneck.rst +++ /dev/null @@ -1,62 +0,0 @@ -torch.utils.bottleneck -====================== - -.. automodule:: torch.utils.bottleneck -.. currentmodule:: torch.utils.bottleneck - -`torch.utils.bottleneck` is a tool that can be used as an initial step for -debugging bottlenecks in your program. It summarizes runs of your script with -the Python profiler and PyTorch's autograd profiler. - -Run it on the command line with - -:: - - python -m torch.utils.bottleneck /path/to/source/script.py [args] - -where [args] are any number of arguments to `script.py`, or run -``python -m torch.utils.bottleneck -h`` for more usage instructions. - -.. warning:: - Because your script will be profiled, please ensure that it exits in a - finite amount of time. - -.. warning:: - Due to the asynchronous nature of CUDA kernels, when running against - CUDA code, the cProfile output and CPU-mode autograd profilers may - not show correct timings: the reported CPU time reports the amount of time - used to launch the kernels but does not include the time the kernel - spent executing on a GPU unless the operation does a synchronize. - Ops that do synchronize appear to be extremely expensive under regular - CPU-mode profilers. - In these case where timings are incorrect, the CUDA-mode autograd profiler - may be helpful. - -.. note:: - To decide which (CPU-only-mode or CUDA-mode) autograd profiler output to - look at, you should first check if your script is CPU-bound - ("CPU total time is much greater than CUDA total time"). - If it is CPU-bound, looking at the results of the CPU-mode autograd - profiler will help. If on the other hand your script spends most of its - time executing on the GPU, then it makes sense to start - looking for responsible CUDA operators in the output of the CUDA-mode - autograd profiler. - - Of course the reality is much more complicated and your script might not be - in one of those two extremes depending on the part of the model you're - evaluating. If the profiler outputs don't help, you could try looking at - the result of :func:`torch.autograd.profiler.emit_nvtx()` with ``nvprof``. 
- However, please take into account that the NVTX overhead is very high and - often gives a heavily skewed timeline. Similarly, ``Intel® VTune™ Profiler`` - helps to analyze performance on Intel platforms further with - :func:`torch.autograd.profiler.emit_itt()`. - -.. warning:: - If you are profiling CUDA code, the first profiler that ``bottleneck`` runs - (cProfile) will include the CUDA startup time (CUDA buffer allocation cost) - in its time reporting. This should not matter if your bottlenecks result - in code much slower than the CUDA startup time. - -For more complicated uses of the profilers (like in a multi-GPU case), -please see https://docs.python.org/3/library/profile.html -or :func:`torch.autograd.profiler.profile()` for more information. diff --git a/docs/source/distributed.tensor.md b/docs/source/distributed.tensor.md index cb12eb195c02..3d65399727e1 100644 --- a/docs/source/distributed.tensor.md +++ b/docs/source/distributed.tensor.md @@ -260,3 +260,73 @@ these features. ```{eval-rst} .. py:module:: torch.distributed.tensor.device_mesh ``` + +## Mixed Tensor and DTensor operations + +So you got the following error message. +``` +got mixed torch.Tensor and DTensor, need to convert all +torch.Tensor to DTensor before calling distributed operators! +``` + +There are two cases. + +### Case 1: this is user error + +The most common way to run into this error is to create a regular Tensor +(using a factory function) and then perform a Tensor-DTensor operation, +like the following: + +``` +tensor = torch.arange(10) +return tensor + dtensor +``` + +We disallow mixed Tensor-DTensor operations: if the input to any operations +(e.g. torch.add) is a DTensor, then all Tensor inputs must be DTensors. +This is because the semantics are ambiguous. We don't know if `tensor` is +the same across ranks or if it is different so we ask that the user +figure out how to construct a DTensor with accurate placements from `tensor`. + +If each rank does have the same `tensor`, then please construct a replicated +DTensor: + +``` +tensor = torch.arange(10) +tensor = DTensor.from_local(tensor, placements=(Replicate(),)) +return tensor + dtensor +``` + +If you wanted to create a DTensor with shards, below is how to do it. +Semantically this means that your Tensor data is split between the shards +and that operations act on the "full stacked data". + +``` +tensor = torch.full([], RANK) +tensor = DTensor.from_local(tensor, placements=(Shard(0),)) +return tensor + dtensor +``` + +There are other things you may wish to do with your tensor beyond +these situations (these are not the only two options!). + +## Case 2: the error came from PyTorch framework code + +Sometimes the problem is that PyTorch framework code attempts to perform mixed +Tensor-DTensor operations. These are bugs in PyTorch, please file an issue +so that we can fix them. + +On the user side, the only thing you can do is to avoid using the operation +that caused the issue and file a bug report. + +For PyTorch Developers: one approach of fixing this is to rewrite PyTorch +framework code to avoid mixed Tensor-DTensor code (like in the previous section). + +For PyTorch Developers: the second approach is to turn on DTensor implicit +replication inside the right places in PyTorch framework code. +When on, any mixed Tensor-DTensor operations will assume that the +non-DTensors can be replicated. Please be careful when using this as it +can lead to silent incorrectness. 
+ +- [Turning on implicit replication in Python](https://github.com/pytorch/pytorch/blob/d8e6b2fddc54c748d976e8f0ebe4b63ebe36d85b/torch/distributed/tensor/experimental/__init__.py#L15) +- [Turning on implicit replication in C++](https://github.com/pytorch/pytorch/blob/7a0f93344e2c851b9bcf2b9c3225a323d48fde26/aten/src/ATen/DTensorState.h#L10) diff --git a/docs/source/fx.experimental.md b/docs/source/fx.experimental.md index 24125cd310bc..cba695b5e1c5 100644 --- a/docs/source/fx.experimental.md +++ b/docs/source/fx.experimental.md @@ -8,6 +8,10 @@ These APIs are experimental and subject to change without notice. ::: +```{eval-rst} +.. autoclass:: torch.fx.experimental.sym_node.DynamicInt +``` + ## torch.fx.experimental.symbolic_shapes ```{eval-rst} diff --git a/docs/source/pytorch-api.md b/docs/source/pytorch-api.md index 2e858079d239..3b3f0f627bdd 100644 --- a/docs/source/pytorch-api.md +++ b/docs/source/pytorch-api.md @@ -76,7 +76,6 @@ storage torch.testing torch.utils torch.utils.benchmark -torch.utils.bottleneck torch.utils.checkpoint torch.utils.cpp_extension torch.utils.data diff --git a/setup.py b/setup.py index c0523a1b5c60..2bb63a93cec8 100644 --- a/setup.py +++ b/setup.py @@ -227,9 +227,6 @@ # Static link mimalloc into C10, and use mimalloc in alloc_cpu & alloc_free. # By default, It is only enabled on Windows. # -# USE_PRIORITIZED_TEXT_FOR_LD -# Uses prioritized text form cmake/prioritized_text.txt for LD -# # BUILD_LIBTORCH_WHL # Builds libtorch.so and its dependencies as a wheel # @@ -323,7 +320,6 @@ IS_LINUX, IS_WINDOWS, ) -from tools.setup_helpers.generate_linker_script import gen_linker_script def str2bool(value: str | None) -> bool: @@ -1627,26 +1623,6 @@ def main() -> None: if BUILD_PYTHON_ONLY: install_requires += [f"{LIBTORCH_PKG_NAME}=={TORCH_VERSION}"] - if str2bool(os.getenv("USE_PRIORITIZED_TEXT_FOR_LD")): - gen_linker_script( - filein="cmake/prioritized_text.txt", fout="cmake/linker_script.ld" - ) - linker_script_path = os.path.abspath("cmake/linker_script.ld") - os.environ["LDFLAGS"] = os.getenv("LDFLAGS", "") + f" -T{linker_script_path}" - os.environ["CFLAGS"] = ( - os.getenv("CFLAGS", "") + " -ffunction-sections -fdata-sections" - ) - os.environ["CXXFLAGS"] = ( - os.getenv("CXXFLAGS", "") + " -ffunction-sections -fdata-sections" - ) - elif platform.system() == "Linux" and platform.processor() == "aarch64": - print_box( - """ - WARNING: we strongly recommend enabling linker script optimization for ARM + CUDA. - To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1 - """ - ) - # Parse the command line and check the arguments before we proceed with # building deps and setup. We need to set values so `--help` works. 
dist = Distribution() diff --git a/test/ao/sparsity/test_activation_sparsifier.py b/test/ao/sparsity/test_activation_sparsifier.py index 8e1525b85879..923ffa16fa02 100644 --- a/test/ao/sparsity/test_activation_sparsifier.py +++ b/test/ao/sparsity/test_activation_sparsifier.py @@ -1,4 +1,4 @@ -# Owner(s): ["module: unknown"] +# Owner(s): ["module: sparse"] import copy diff --git a/test/ao/sparsity/test_composability.py b/test/ao/sparsity/test_composability.py index 528fe9b83c65..1725f288cf7c 100644 --- a/test/ao/sparsity/test_composability.py +++ b/test/ao/sparsity/test_composability.py @@ -1,4 +1,4 @@ -# Owner(s): ["module: unknown"] +# Owner(s): ["module: sparse"] import torch diff --git a/test/ao/sparsity/test_data_scheduler.py b/test/ao/sparsity/test_data_scheduler.py index cc4d8ddae63f..de0a885f0153 100644 --- a/test/ao/sparsity/test_data_scheduler.py +++ b/test/ao/sparsity/test_data_scheduler.py @@ -1,4 +1,4 @@ -# Owner(s): ["module: unknown"] +# Owner(s): ["module: sparse"] import copy import warnings diff --git a/test/ao/sparsity/test_data_sparsifier.py b/test/ao/sparsity/test_data_sparsifier.py index 5217049aafdf..c333138769a4 100644 --- a/test/ao/sparsity/test_data_sparsifier.py +++ b/test/ao/sparsity/test_data_sparsifier.py @@ -1,4 +1,4 @@ -# Owner(s): ["module: unknown"] +# Owner(s): ["module: sparse"] import copy import itertools diff --git a/test/ao/sparsity/test_kernels.py b/test/ao/sparsity/test_kernels.py index 1ffdca5fd343..86d8ad4d3a62 100644 --- a/test/ao/sparsity/test_kernels.py +++ b/test/ao/sparsity/test_kernels.py @@ -1,4 +1,4 @@ -# Owner(s): ["module: unknown"] +# Owner(s): ["module: sparse"] import copy import io diff --git a/test/ao/sparsity/test_parametrization.py b/test/ao/sparsity/test_parametrization.py index ac79b6309cf9..95d90725d3c6 100644 --- a/test/ao/sparsity/test_parametrization.py +++ b/test/ao/sparsity/test_parametrization.py @@ -1,4 +1,4 @@ -# Owner(s): ["module: unknown"] +# Owner(s): ["module: sparse"] import torch diff --git a/test/ao/sparsity/test_scheduler.py b/test/ao/sparsity/test_scheduler.py index b563efac73bd..0477b70fd878 100644 --- a/test/ao/sparsity/test_scheduler.py +++ b/test/ao/sparsity/test_scheduler.py @@ -1,4 +1,4 @@ -# Owner(s): ["module: unknown"] +# Owner(s): ["module: sparse"] import warnings diff --git a/test/ao/sparsity/test_sparsifier.py b/test/ao/sparsity/test_sparsifier.py index ca80fa7dde7f..86e26e5ca11e 100644 --- a/test/ao/sparsity/test_sparsifier.py +++ b/test/ao/sparsity/test_sparsifier.py @@ -1,4 +1,4 @@ -# Owner(s): ["module: unknown"] +# Owner(s): ["module: sparse"] import itertools import re diff --git a/test/ao/sparsity/test_sparsity_utils.py b/test/ao/sparsity/test_sparsity_utils.py index 45385bca6f6d..f2deaeb1ecc2 100644 --- a/test/ao/sparsity/test_sparsity_utils.py +++ b/test/ao/sparsity/test_sparsity_utils.py @@ -1,4 +1,4 @@ -# Owner(s): ["module: unknown"] +# Owner(s): ["module: sparse"] import logging diff --git a/test/ao/sparsity/test_structured_sparsifier.py b/test/ao/sparsity/test_structured_sparsifier.py index c62cc3d30539..812490452767 100644 --- a/test/ao/sparsity/test_structured_sparsifier.py +++ b/test/ao/sparsity/test_structured_sparsifier.py @@ -1,4 +1,4 @@ -# Owner(s): ["module: unknown"] +# Owner(s): ["module: sparse"] import copy import random diff --git a/test/bottleneck_test/test.py b/test/bottleneck_test/test.py deleted file mode 100644 index 0549a6372ab9..000000000000 --- a/test/bottleneck_test/test.py +++ /dev/null @@ -1,7 +0,0 @@ -# Owner(s): ["module: unknown"] - -import torch - - -x = 
torch.ones((3, 3), requires_grad=True) -(3 * x).sum().backward() diff --git a/test/bottleneck_test/test_args.py b/test/bottleneck_test/test_args.py deleted file mode 100644 index 38fc03701bf2..000000000000 --- a/test/bottleneck_test/test_args.py +++ /dev/null @@ -1,17 +0,0 @@ -# Owner(s): ["module: unknown"] - -import argparse - -import torch - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - # Required args. Raises error if they aren't passed. - parser.add_argument("--foo", help="foo", required=True) - parser.add_argument("--bar", help="bar", required=True) - _ = parser.parse_args() - - x = torch.ones((3, 3), requires_grad=True) - (3 * x).sum().backward() diff --git a/test/bottleneck_test/test_cuda.py b/test/bottleneck_test/test_cuda.py deleted file mode 100644 index d9f9b0b8274f..000000000000 --- a/test/bottleneck_test/test_cuda.py +++ /dev/null @@ -1,29 +0,0 @@ -# Owner(s): ["module: unknown"] - -import torch -import torch.nn as nn - - -class Model(nn.Module): - def __init__(self) -> None: - super().__init__() - self.linear = nn.Linear(20, 20) - - def forward(self, input): - out = self.linear(input[:, 10:30]) - return out.sum() - - -def main(): - data = torch.randn(10, 50).cuda() - model = Model().cuda() - optimizer = torch.optim.SGD(model.parameters(), lr=0.0001) - for _ in range(10): - optimizer.zero_grad() - loss = model(data) - loss.backward() - optimizer.step() - - -if __name__ == "__main__": - main() diff --git a/test/distributed/_composable/fsdp/test_fully_shard_state_dict.py b/test/distributed/_composable/fsdp/test_fully_shard_state_dict.py index 800076d3225d..d193d65b179a 100644 --- a/test/distributed/_composable/fsdp/test_fully_shard_state_dict.py +++ b/test/distributed/_composable/fsdp/test_fully_shard_state_dict.py @@ -117,6 +117,49 @@ def _shard_placement_fn(param: nn.Parameter) -> Optional[Shard]: for key, value in ref_sharded_sd.items(): self.assertEqual(value, sharded_sd[key]) + @skip_if_lt_x_gpu(2) + def test_cached_state_dict(self): + self.run_subtests( + {"mlp_dim": [2, 3, 4, 5], "mutate_after_state_dict": [True, False]}, + self._test_cached_state_dict, + ) + + def _test_cached_state_dict(self, mlp_dim: int, mutate_after_state_dict: bool): + torch.manual_seed(42) + model = nn.Linear(mlp_dim, mlp_dim, bias=False) + fully_shard(model, reshard_after_forward=True) + optim = torch.optim.AdamW(model.parameters(), lr=1e-2) + + # call .state_dict() once and use `sd` directly to reduce cpu overhead + sd = model.state_dict() + assert isinstance(model.weight, DTensor) + + if not mutate_after_state_dict: + self.assertTrue( + sd["weight"]._local_tensor.untyped_storage().data_ptr() + == model.weight._local_tensor.untyped_storage().data_ptr() + ) + else: + model = model.cpu() + model = model.cuda() + self.assertTrue( + sd["weight"]._local_tensor.untyped_storage().data_ptr() + != model.weight._local_tensor.untyped_storage().data_ptr() + ) + + torch.manual_seed(42 + self.rank) + inp = torch.rand(mlp_dim, mlp_dim, device="cuda") + for _ in range(5): + optim.zero_grad() + loss = model(inp).sum() + loss.backward() + optim.step() + if not mutate_after_state_dict: + self.assertTrue( + sd["weight"]._local_tensor.untyped_storage().data_ptr() + == model.weight._local_tensor.untyped_storage().data_ptr() + ) + @skip_if_lt_x_gpu(2) def test_dp_state_dict_cpu_offload(self): self.run_subtests( diff --git a/test/distributed/tensor/debug/test_debug_mode.py b/test/distributed/tensor/debug/test_debug_mode.py index b5b57169bfe5..f1620e0bc7b2 100644 --- 
a/test/distributed/tensor/debug/test_debug_mode.py +++ b/test/distributed/tensor/debug/test_debug_mode.py @@ -41,7 +41,7 @@ def test_debug_mode_mm(self): x_dtensor = DTensor.from_local(x, mesh, [Shard(0)], run_check=False) y_dtensor = DTensor.from_local(y, mesh, [Shard(0)], run_check=False) - with DebugMode() as debug_mode: + with DebugMode(record_torchfunction=True) as debug_mode: torch.mm(x_dtensor, y_dtensor).sum() self.assertExpectedInline( @@ -80,7 +80,7 @@ def test_debug_mode_backward(self): x_dtensor = DTensor.from_local(x, mesh, [Shard(0)], run_check=False) y_dtensor = DTensor.from_local(y, mesh, [Shard(1)], run_check=False) - with DebugMode() as debug_mode: + with DebugMode(record_torchfunction=True) as debug_mode: z = x_dtensor + y_dtensor z.sum().backward() @@ -121,7 +121,7 @@ def test_debug_mode_einsum(self): b_dt = DTensor.from_local(b, mesh, [Replicate(), Partial()], run_check=False) # Capture the operator decomposition - with DebugMode() as debug_mode: + with DebugMode(record_torchfunction=True) as debug_mode: torch.einsum("bld,dnh->blnh", a_dt, b_dt) self.assertExpectedInline( @@ -176,7 +176,7 @@ def test_real_tensor(self): x = torch.randn(8, 8, 8) linear = torch.nn.Linear(8, 8) - with DebugMode() as debug_mode: + with DebugMode(record_torchfunction=True) as debug_mode: linear(x).sum() self.assertExpectedInline( @@ -196,7 +196,7 @@ def test_fake_tensor(self): x = torch.randn(8, 8) y = torch.randn(8, 8, 8) - with DebugMode(record_faketensor=True) as debug_mode: + with DebugMode(record_torchfunction=True, record_faketensor=True) as debug_mode: torch.matmul(y, x) self.assertExpectedInline( diff --git a/test/distributed/tensor/test_attention.py b/test/distributed/tensor/test_attention.py index 0b48118c2460..cf9420b66e70 100644 --- a/test/distributed/tensor/test_attention.py +++ b/test/distributed/tensor/test_attention.py @@ -9,12 +9,11 @@ import torch import torch.distributed as dist import torch.nn.functional as F -from torch import nn, Tensor +from torch import Tensor from torch.distributed.device_mesh import init_device_mesh from torch.distributed.tensor import DeviceMesh from torch.distributed.tensor.debug import CommDebugMode from torch.distributed.tensor.experimental._attention import ( - _AttentionContextParallel, _CausalBehavior, _cp_options, _DispatchMode, @@ -24,7 +23,6 @@ context_parallel_unshard, set_rotate_method, ) -from torch.distributed.tensor.parallel import parallelize_module from torch.nn.attention import sdpa_kernel, SDPBackend from torch.nn.attention.flex_attention import ( _mask_mod_signature, @@ -42,8 +40,6 @@ from torch.testing._internal.common_utils import run_tests, skipIfRocm from torch.testing._internal.distributed._tensor.common_dtensor import ( DTensorTestBase, - ModelArgs, - Transformer, with_comms, ) @@ -273,180 +269,6 @@ def test_is_causal_behavior(self) -> None: behavior, ) - @skip_if_lt_x_gpu(2) - @unittest.skipIf( - not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention" - ) - @with_comms - def test_ring_attention_native_transformer(self) -> None: - self.run_subtests( - { - "is_causal": [True, False], - "rotater": [_RotateMethod.ALL_GATHER, _RotateMethod.ALL_TO_ALL], - }, - self._test_ring_attention_native_transformer, - ) - - @sdpa_kernel(backends=[SDPBackend.FLASH_ATTENTION]) - def _test_ring_attention_native_transformer( - self, is_causal: bool, rotater: _RotateMethod - ) -> None: - _cp_options.enable_load_balance = is_causal - set_rotate_method(rotater_enum_to_str[rotater]) - self.assertEqual(_cp_options.rotate_method, 
rotater) - device_mesh = DeviceMesh( - self.device_type, - torch.arange(0, self.world_size), - ) - dtype = torch.bfloat16 - bs = 8 - ntokens = 8 - dim = 32 - nheads = 8 - num_layers = 2 - - encoder_layer = nn.TransformerEncoderLayer( - d_model=dim, - nhead=nheads, - dim_feedforward=dim, - batch_first=True, - ).to(dtype) - encoder_layer = parallelize_module( - module=encoder_layer, - device_mesh=device_mesh, - parallelize_plan={ - "self_attn": _AttentionContextParallel(), - }, - ) - model = nn.TransformerEncoder(encoder_layer, num_layers=num_layers) - model = model.to(self.device_type).to(dtype) - - mask = ( - nn.Transformer.generate_square_subsequent_mask( - ntokens, device=self.device_type, dtype=dtype - ) - if is_causal - else None - ) - seq = torch.rand((bs, ntokens, dim), device=self.device_type, dtype=dtype) - - with CommDebugMode() as comm_mode: - out = model(seq, mask=mask, is_causal=is_causal) - - if rotater == _RotateMethod.ALL_TO_ALL: - self.assertDictEqual( - comm_mode.get_comm_counts(), - { - c10d_functional.all_to_all_single: (self.world_size - 1) - * num_layers, - }, - ) - else: - self.assertDictEqual( - comm_mode.get_comm_counts(), - { - c10d_functional.all_gather_into_tensor: num_layers, - }, - ) - - with CommDebugMode() as comm_mode: - out.sum().backward() - - if rotater == _RotateMethod.ALL_TO_ALL: - self.assertDictEqual( - comm_mode.get_comm_counts(), - { - c10d_functional.all_to_all_single: (self.world_size * 2 - 1) - * num_layers, - }, - ) - else: - self.assertDictEqual( - comm_mode.get_comm_counts(), - { - c10d_functional.all_gather_into_tensor: num_layers, - c10d_functional.all_to_all_single: self.world_size * num_layers, - }, - ) - - @skip_if_lt_x_gpu(2) - @unittest.skipIf( - not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention" - ) - @with_comms - @sdpa_kernel(backends=[SDPBackend.FLASH_ATTENTION]) - def test_ring_attention_custom_transformer(self) -> None: - self.run_subtests( - {"rotater": [_RotateMethod.ALL_GATHER, _RotateMethod.ALL_TO_ALL]}, - self._test_ring_attention_custom_transformer, - ) - - def _test_ring_attention_custom_transformer(self, rotater: _RotateMethod) -> None: - set_rotate_method(rotater_enum_to_str[rotater]) - self.assertEqual(_cp_options.rotate_method, rotater) - device_mesh = DeviceMesh( - self.device_type, - torch.arange(0, self.world_size), - ) - # early init DTensor RNG tracker to avoid broadcast be captuured in comm_mode - torch.distributed.tensor._random.manual_seed(10, device_mesh) - - dtype = torch.bfloat16 - bs = 2 - args = ModelArgs() - - model = Transformer(args).to(dtype).to(self.device_type) - - model = parallelize_module( - module=model, - device_mesh=device_mesh, - parallelize_plan={ - f"layers.{i}.attention": _AttentionContextParallel() - for i in range(args.n_layers) - }, - ) - - seq = torch.randint( - args.vocab_size, (bs, args.max_seq_len), device=self.device_type - ) - - with CommDebugMode() as comm_mode: - out = model(seq) - - if rotater == _RotateMethod.ALL_TO_ALL: - self.assertDictEqual( - comm_mode.get_comm_counts(), - { - c10d_functional.all_to_all_single: (self.world_size - 1) - * args.n_layers, - }, - ) - else: - self.assertDictEqual( - comm_mode.get_comm_counts(), - {c10d_functional.all_gather_into_tensor: args.n_layers}, - ) - - with CommDebugMode() as comm_mode: - out.sum().backward() - - if rotater == _RotateMethod.ALL_TO_ALL: - self.assertDictEqual( - comm_mode.get_comm_counts(), - { - c10d_functional.all_to_all_single: (self.world_size * 2 - 1) - * args.n_layers, - }, - ) - else: - 
self.assertDictEqual(
-                comm_mode.get_comm_counts(),
-                {
-                    c10d_functional.all_gather_into_tensor: args.n_layers,
-                    c10d_functional.all_to_all_single: self.world_size * args.n_layers,
-                },
-            )
-
 
 # Compile the flex_attention function
 compiled_flex_attention = torch.compile(flex_attention, dynamic=False, fullgraph=True)
@@ -532,12 +354,12 @@ def doc_mask_mod(b, h, q_idx, kv_idx):
     return doc_mask_mod
 
 
-class RingFlexAttentionTest(DTensorTestBase):
+class CPFlexAttentionTest(DTensorTestBase):
     @property
     def world_size(self) -> int:
         return 2
 
-    def _test_ring_flex_attention(
+    def _test_cp_flex_attention(
         self, qkv_size, B=1, mask_func=causal_mask, atol=1e-6, rtol=1e-2
     ) -> None:
         torch.cuda.manual_seed(10)
@@ -586,15 +408,6 @@ def _test_ring_flex_attention(
             mesh_shape=(self.world_size,),
             mesh_dim_names=("cp",),
         )
-        # NOTE: cp needs to know the sharding dimension
-        # TODO: see if this can be moved to the cp context
-        from torch.distributed.tensor.experimental._attention import _set_cp_global_var
-
-        _set_cp_global_var("cp_shard_dim", 2)
-        self.assertEqual(
-            torch.distributed.tensor.experimental._attention._cp_global_vars.cp_shard_dim,
-            2,
-        )
 
         # NOTE: we do not test load balance here
         _cp_options.enable_load_balance = False
@@ -684,17 +497,17 @@ def _test_ring_flex_attention(
     @unittest.skipIf(
         not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention"
     )
-    def test_ring_flex_attention(self) -> None:
+    def test_cp_flex_attention(self) -> None:
         self.run_subtests(
             {"qkv_size": [128 * self.world_size, 2048]},
-            self._test_ring_flex_attention,
+            self._test_cp_flex_attention,
         )
 
         # NOTE: Context Parallel should not be used for small attentions (block_size < 128)
         with self.assertRaisesRegex(AssertionError, "Tensor-likes are not close"):
             self.run_subtests(
                 {"qkv_size": [64 * self.world_size]},
-                self._test_ring_flex_attention,
+                self._test_cp_flex_attention,
             )
 
     # TODO: merge with the above test
@@ -703,7 +516,7 @@ def test_ring_flex_attention(self) -> None:
     @unittest.skipIf(
         not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention"
     )
-    def test_ring_flex_attention_document_mask(self) -> None:
+    def test_cp_flex_attention_document_mask(self) -> None:
         random.seed(10)
 
         # NOTE: Each (batch_size, seq_len) tuple introduces 2 create_block_mask
@@ -724,7 +537,7 @@ def test_ring_flex_attention_document_mask(self) -> None:
 
         # TODO: change this for-loop to run_subtests
         # Use a for-loop instead of run_subtests because we need to initialize the mask
-        # for each subtest. This can be baked into self._test_ring_flex_attention as
+        # for each subtest. This can be baked into self._test_cp_flex_attention as
         # a str argument denoting mask type.
         for batch_size, max_seq_len in itertools.product(
             batch_size_list, max_seq_len_list
@@ -738,7 +551,7 @@ def test_ring_flex_attention_document_mask(self) -> None:
 
             # construct testing function
             test_func = functools.partial(
-                self._test_ring_flex_attention,
+                self._test_cp_flex_attention,
                 qkv_size=max_seq_len,
                 B=batch_size,
                 mask_func=document_causal_mask,
diff --git a/test/distributed/tensor/test_dtensor_ops.py b/test/distributed/tensor/test_dtensor_ops.py
index 0ea6fa3a2a70..423dda9d43d5 100644
--- a/test/distributed/tensor/test_dtensor_ops.py
+++ b/test/distributed/tensor/test_dtensor_ops.py
@@ -7,7 +7,7 @@
 import torch
 import torch.distributed as dist
 import torch.testing._internal.common_methods_invocations as common_ops
-from torch.distributed.tensor import DTensor, init_device_mesh
+from torch.distributed.tensor import distribute_tensor, DTensor, init_device_mesh, Shard
 from torch.overrides import resolve_name
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests,
@@ -21,6 +21,7 @@
 )
 from torch.utils import _pytree as pytree
 from torch.utils._pytree import tree_map
+from torch.utils.debug_mode import DebugMode
 
 
 # rewrite common size variables to sth can be sharded evenly
@@ -117,7 +118,6 @@ def wrapped(fn):
     xfail("cholesky"),
     xfail("cholesky_inverse"),
     xfail("cholesky_solve"),
-    xfail("chunk"),
     xfail("combinations"),
     xfail("complex"),
     xfail("count_nonzero"),
@@ -661,6 +661,36 @@ def test_one_hot(self):
             sample_inputs_filter=lambda s: s.kwargs["num_classes"] != -1,
         )
 
+    def test_mean(self):
+        self.mesh = init_device_mesh(DEVICE_TYPE, (self.world_size,))
+
+        shape = [2 * self.world_size + 1, 2 * self.world_size]
+        tensor = (
+            torch.arange(shape[0] * shape[1], dtype=torch.float32)
+            .reshape(shape)
+            .to(DEVICE_TYPE)
+        )
+
+        for is_evenly_shardable in [True]:
+            if is_evenly_shardable:
+                placement = [Shard(1)]
+                reduce_dim = 1
+            else:
+                placement = [Shard(0)]
+                reduce_dim = 0
+            dtensor = distribute_tensor(tensor, self.mesh, placement)
+
+            with DebugMode(record_torchfunction=False) as debug_mode:
+                mean = dtensor.mean(dim=reduce_dim)
+                full_tensor = mean.full_tensor()
+
+            self.assertEqual(full_tensor, tensor.mean(dim=reduce_dim))
+
+            if is_evenly_shardable:
+                self.assertFalse("redistribute_input" in debug_mode.debug_string())
+            else:
+                self.assertTrue("redistribute_input" in debug_mode.debug_string())
+
 
 
 # only instantiate tests for DEVICE_TYPE alone (i.e.
either CPU or GPU) instantiate_device_type_tests(TestDTensorOps, globals(), only_for=(DEVICE_TYPE,)) diff --git a/test/distributed/test_nvshmem_triton.py b/test/distributed/test_nvshmem_triton.py index 58f6a74bacc6..7db4509f162b 100644 --- a/test/distributed/test_nvshmem_triton.py +++ b/test/distributed/test_nvshmem_triton.py @@ -14,7 +14,6 @@ instantiate_parametrized_tests, parametrize, run_tests, - skip_but_pass_in_sandcastle, skip_but_pass_in_sandcastle_if, skipIfRocm, ) @@ -64,22 +63,20 @@ def nvshmem_get_kernel( @triton.jit def nvshmem_putmem_signal_block_kernel( - dst_ptr, - src_ptr, + dst, + src, size_bytes, - sig_ptr, - signal_val, + signal, + sig_val, sig_op, peer, ): - nvshmem.putmem_signal_block( - dst_ptr, src_ptr, size_bytes, sig_ptr, signal_val, sig_op, peer - ) + nvshmem.putmem_signal_block(dst, src, size_bytes, signal, sig_val, sig_op, peer) @triton.jit -def nvshmem_signal_wait_until_kernel(sig_ptr, cmp_op, cmp_val): - nvshmem.signal_wait_until(sig_ptr, cmp_op, cmp_val) +def nvshmem_signal_wait_until_kernel(signal, cmp_op, cmp_val): + nvshmem.signal_wait_until(signal, cmp_op, cmp_val) @triton.jit @@ -399,7 +396,6 @@ def test_triton_get_ring(self) -> None: out, expected_value * torch.ones(numel, dtype=dtype, device=self.device) ) - @skip_but_pass_in_sandcastle("Hangs") @skipIfRocm @requires_triton() @requires_h100() @@ -421,7 +417,7 @@ def test_triton_put_signal_set(self) -> None: val = 11 inp = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(val) out = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(-1) - inp_hdl = symm_mem.rendezvous(inp, group=group_name) + symm_mem.rendezvous(inp, group=group_name) out_hdl = symm_mem.rendezvous(out, group=group_name) # Use the signal pad attached to the output symmetric memory handle @@ -435,15 +431,12 @@ def test_triton_put_signal_set(self) -> None: if rank == 0: # Rank 0 puts into Rank 1 - dst_ptr = out_hdl.buffer_ptrs[peer] - src_ptr = inp_hdl.buffer_ptrs[rank] - sig_ptr = out_hdl.signal_pad_ptrs[peer] nvshmem_putmem_signal_block_kernel[(1, 1, 1)]( - dst_ptr, - src_ptr, + out, + inp, size_bytes=msg_size_bytes, - sig_ptr=sig_ptr, - signal_val=SIGNAL_VAL, + signal=flag, + sig_val=SIGNAL_VAL, sig_op=NVSHMEM_SIGNAL_SET, peer=peer, extern_libs=nvshmem_lib, @@ -451,9 +444,8 @@ def test_triton_put_signal_set(self) -> None: if rank == 1: # Wait until signal flag is set by Rank 0 - sig_ptr_local = out_hdl.signal_pad_ptrs[rank] nvshmem_signal_wait_until_kernel[(1,)]( - sig_ptr_local, + flag, cmp_op=NVSHMEM_CMP_EQ, cmp_val=SIGNAL_VAL, extern_libs=nvshmem_lib, @@ -466,7 +458,6 @@ def test_triton_put_signal_set(self) -> None: flag, torch.tensor([SIGNAL_VAL], dtype=torch.int64, device=self.device) ) - @skip_but_pass_in_sandcastle("Hangs") @skipIfRocm @requires_triton() @requires_h100() @@ -488,7 +479,7 @@ def test_triton_put_signal_add(self) -> None: val = 11 inp = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(val) out = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(-1) - inp_hdl = symm_mem.rendezvous(inp, group=group_name) + symm_mem.rendezvous(inp, group=group_name) out_hdl = symm_mem.rendezvous(out, group=group_name) # Use the signal pad attached to the output symmetric memory handle @@ -502,24 +493,20 @@ def test_triton_put_signal_add(self) -> None: if rank == 0: # Rank 0 puts into Rank 1 - dst_ptr = out_hdl.buffer_ptrs[peer] - src_ptr = inp_hdl.buffer_ptrs[rank] - sig_ptr = out_hdl.signal_pad_ptrs[peer] nvshmem_putmem_signal_block_kernel[(1, 1, 1)]( - dst_ptr, - src_ptr, + out, + inp, 
size_bytes=msg_size_bytes, - sig_ptr=sig_ptr, - signal_val=SIGNAL_VAL, + signal=flag, + sig_val=SIGNAL_VAL, sig_op=NVSHMEM_SIGNAL_ADD, peer=peer, extern_libs=nvshmem_lib, ) if rank == 1: - sig_ptr_local = out_hdl.signal_pad_ptrs[rank] nvshmem_signal_wait_until_kernel[(1, 1, 1)]( - sig_ptr_local, + flag, cmp_op=NVSHMEM_CMP_EQ, cmp_val=SIGNAL_VAL, extern_libs=nvshmem_lib, @@ -585,7 +572,6 @@ def test_triton_wait_until(self) -> None: extern_libs=nvshmem_lib, ) - @skip_but_pass_in_sandcastle("Hangs") @skipIfRocm @requires_triton() @requires_h100() @@ -612,7 +598,7 @@ def test_triton_signal_wait_until(self) -> None: # Producer (rank 0) prepares the data to send inp = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(val_to_put) - inp_hdl = symm_mem.rendezvous(inp, group=group_name) + symm_mem.rendezvous(inp, group=group_name) # Consumer (rank 1) prepares the destination buffer out = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(-1) out_hdl = symm_mem.rendezvous(out, group=group_name) @@ -622,24 +608,20 @@ def test_triton_signal_wait_until(self) -> None: if rank == 0: # Producer (rank 0): Puts data into rank 1's `out` buffer and then sets the flag - dst_ptr = out_hdl.buffer_ptrs[peer] - src_ptr = inp_hdl.buffer_ptrs[rank] - sig_ptr = out_hdl.signal_pad_ptrs[peer] nvshmem_putmem_signal_block_kernel[(1, 1, 1)]( - dst_ptr, - src_ptr, + out, + inp, size_bytes=msg_size_bytes, - sig_ptr=sig_ptr, - signal_val=COMPLETION_FLAG_VAL, + signal=flag, + sig_val=COMPLETION_FLAG_VAL, sig_op=NVSHMEM_SIGNAL_SET, peer=peer, extern_libs=nvshmem_lib, ) elif rank == 1: # Consumer (rank 1): Waits on the signal variable using `signal_wait_until`. - sig_ptr = out_hdl.signal_pad_ptrs[rank] nvshmem_signal_wait_until_kernel[(1, 1, 1)]( - sig_ptr, + flag, cmp_op=NVSHMEM_CMP_EQ, cmp_val=COMPLETION_FLAG_VAL, extern_libs=nvshmem_lib, diff --git a/test/dynamo/test_error_messages.py b/test/dynamo/test_error_messages.py index 081ceb5065df..172ced2a58a0 100644 --- a/test/dynamo/test_error_messages.py +++ b/test/dynamo/test_error_messages.py @@ -48,27 +48,6 @@ def __exit__(self, exc_type, exc_value, traceback): class ErrorMessagesTest(LoggingTestCase): - def test_dynamic_shape_operator(self): - def fn(): - return torch.nonzero(torch.rand([10, 10])) - - self.assertExpectedInlineMunged( - Unsupported, - lambda: torch.compile(fn, backend="eager", fullgraph=True)(), - """\ -Dynamic shape operator - Explanation: Operator `aten.nonzero.default`'s output shape depends on input Tensor data. - Hint: Enable tracing of dynamic shape operators with `torch._dynamo.config.capture_dynamic_output_shape_ops = True` - - Developer debug context: aten.nonzero.default - - For more details about this graph break, please visit: https://meta-pytorch.github.io/compile-graph-break-site/gb/gb0036.html - -from user code: - File "test_error_messages.py", line N, in fn - return torch.nonzero(torch.rand([10, 10]))""", - ) - def test_dynamic_shape_operator_no_meta_kernel(self): def fn(): return torch.linalg.lstsq(torch.rand(10, 10), torch.rand(10, 10)) @@ -91,29 +70,6 @@ def fn(): return torch.linalg.lstsq(torch.rand(10, 10), torch.rand(10, 10))""", ) - def test_data_dependent_operator(self): - def fn(x): - return x.item() - - self.assertExpectedInlineMunged( - Unsupported, - lambda: torch.compile(fn, backend="eager", fullgraph=True)( - torch.Tensor([1]) - ), - """\ -Unsupported Tensor.item() call with capture_scalar_outputs=False - Explanation: Dynamo does not support tracing `Tensor.item()` with config.capture_scalar_outputs=False. 
- Hint: Set `torch._dynamo.config.capture_scalar_outputs = True` or `export TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1` to include these operations in the captured graph. - - Developer debug context: call_method TensorVariable() item () {} - - For more details about this graph break, please visit: https://meta-pytorch.github.io/compile-graph-break-site/gb/gb0124.html - -from user code: - File "test_error_messages.py", line N, in fn - return x.item()""", - ) - def test_data_dependent_operator2(self): def fn(x): return torch.equal(x, x) diff --git a/test/dynamo/test_functions.py b/test/dynamo/test_functions.py index 09666e1da397..cc1d0829948d 100644 --- a/test/dynamo/test_functions.py +++ b/test/dynamo/test_functions.py @@ -2087,6 +2087,12 @@ def test_namedtuple_user_methods(a, b): mytuple = FunctionTests.MyNamedTuple(a, b) return mytuple.add(), mytuple.static_method(), mytuple.class_method() + @make_test + def test_namedtuple_replace(a, b): + mytuple = FunctionTests.MyNamedTuple(a, b) + replaced = mytuple._replace(first=b) + return mytuple.first + mytuple.second + replaced.first + replaced.second + @make_test def test_generic_namedtuple_user_methods(a, b): mytuple = FunctionTests.MyGenericNamedTuple(a, b) diff --git a/test/dynamo/test_graph_region_tracker.py b/test/dynamo/test_graph_region_tracker.py index e930ff787a9a..ce456596fd55 100644 --- a/test/dynamo/test_graph_region_tracker.py +++ b/test/dynamo/test_graph_region_tracker.py @@ -1,6 +1,5 @@ # Owner(s): ["module: dynamo"] import contextlib -import os import torch import torch.fx @@ -196,21 +195,6 @@ def fn(x, y, z): ) def test_mismatched_global_state(self): - @contextlib.contextmanager - def _hip_allow_tf32(): - # for HIP/AMDGPU, tf32 is behind a flag because the TF32 support is new - # and only for MI300+ - hip_allow_tf32 = os.environ.get("HIPBLASLT_ALLOW_TF32", None) - os.environ["HIPBLASLT_ALLOW_TF32"] = "1" - - try: - yield - finally: - if hip_allow_tf32 is not None: - os.environ["HIPBLASLT_ALLOW_TF32"] = hip_allow_tf32 - else: - del os.environ["HIPBLASLT_ALLOW_TF32"] - def inner_fn(x, y): x1 = x * 1 y1 = y + 1 @@ -251,31 +235,29 @@ def set_default_dtype_bfloat16(): def reset_default_dtype(): torch.set_default_dtype(old_dtype) - tf32_ctx = _hip_allow_tf32 if torch.version.hip else contextlib.nullcontext - with tf32_ctx(): - for ctx in [ - lambda: torch.set_grad_enabled(False), - torch.autograd.grad_mode.inference_mode, - lambda: torch.autograd.graph.disable_saved_tensors_hooks( - "This is not supported" - ), - # lambda: torch.set_num_threads(2), : Unsupported - (set_default_dtype_bfloat16, reset_default_dtype), - ( - lambda: torch.use_deterministic_algorithms(True), - lambda: torch.use_deterministic_algorithms(False), - ), - # (lambda: torch.use_deterministic_algorithms(True, warn_only=True), - # lambda: torch.use_deterministic_algorithms(False)), : Unsupported - create_toggle_fns("allow_bf16_reduced_precision_reduction"), - create_toggle_fns("allow_fp16_reduced_precision_reduction"), - create_toggle_fns("allow_tf32"), - ]: - self.assertExpectedInline( - self.get_result(fn, torch.rand(10, 10), torch.ones(10, 20), ctx), - """[[['x1_2', 'y1_2', 'sum_3', 'o0'], ['x1_3', 'y1_3', 'sum_4', 'o2']], \ + for ctx in [ + lambda: torch.set_grad_enabled(False), + torch.autograd.grad_mode.inference_mode, + lambda: torch.autograd.graph.disable_saved_tensors_hooks( + "This is not supported" + ), + # lambda: torch.set_num_threads(2), : Unsupported + (set_default_dtype_bfloat16, reset_default_dtype), + ( + lambda: 
torch.use_deterministic_algorithms(True), + lambda: torch.use_deterministic_algorithms(False), + ), + # (lambda: torch.use_deterministic_algorithms(True, warn_only=True), + # lambda: torch.use_deterministic_algorithms(False)), : Unsupported + create_toggle_fns("allow_bf16_reduced_precision_reduction"), + create_toggle_fns("allow_fp16_reduced_precision_reduction"), + create_toggle_fns("allow_tf32"), + ]: + self.assertExpectedInline( + self.get_result(fn, torch.rand(10, 10), torch.ones(10, 20), ctx), + """[[['x1_2', 'y1_2', 'sum_3', 'o0'], ['x1_3', 'y1_3', 'sum_4', 'o2']], \ [['x1', 'y1', 'sum_1', 'o4'], ['x1_1', 'y1_1', 'sum_2', 'o5']]]""", - ) + ) def test_mutation_tracking_simple(self): def fn(x, y, z): diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py index 85831321f09a..648848420a1d 100644 --- a/test/dynamo/test_misc.py +++ b/test/dynamo/test_misc.py @@ -8478,43 +8478,24 @@ def write_state(state): def fn(x): return x + 1 - import contextlib - - @contextlib.contextmanager - def _hip_allow_tf32(): - # for HIP/AMDGPU, tf32 is behind a flag because the TF32 support is new - # and only for MI300+ - hip_allow_tf32 = os.environ.get("HIPBLASLT_ALLOW_TF32", None) - os.environ["HIPBLASLT_ALLOW_TF32"] = "1" - - try: - yield - finally: - if hip_allow_tf32 is not None: - os.environ["HIPBLASLT_ALLOW_TF32"] = hip_allow_tf32 - else: - del os.environ["HIPBLASLT_ALLOW_TF32"] - - tf32_ctx = _hip_allow_tf32 if torch.version.hip else contextlib.nullcontext - with tf32_ctx(): - initial_state = read_state() - y = torch.randn(10) - try: - for round in range(3): - for i in range(len(initial_state)): - new_state = [False] * len(initial_state) - new_state[i] = True - write_state(new_state) - assert read_state() == new_state - last_state.clear() - fn(y) - assert last_state == new_state - if round == 0: - assert cnt == i + 1 - else: - assert cnt == len(initial_state) - finally: - write_state(initial_state) + initial_state = read_state() + y = torch.randn(10) + try: + for round in range(3): + for i in range(len(initial_state)): + new_state = [False] * len(initial_state) + new_state[i] = True + write_state(new_state) + assert read_state() == new_state + last_state.clear() + fn(y) + assert last_state == new_state + if round == 0: + assert cnt == i + 1 + else: + assert cnt == len(initial_state) + finally: + write_state(initial_state) def test_grad_state_mutated(self): prior = torch.is_grad_enabled() @@ -13373,6 +13354,26 @@ def f(actions, n_act, epsilon=0.1): y = torch.tensor(5) f(x, y) + def test_full_graph_capture_scalar_outputs(self): + @torch.compile(fullgraph=True) + def foo(a): + return torch.randn(5) * a.item() + + # We expect to no longer raise here + foo(torch.tensor(2.0)) + + def test_full_graph_capture_dynamic_output_shape_ops(self): + def fn(x): + nz = torch.nonzero(x) + squared = nz * nz + sliced = torch.ops.aten.slice.Tensor(squared, dim=1, start=-2, end=None) + view = sliced.unsqueeze(dim=0) + return view.squeeze(dim=0) + + example_inputs = (torch.randn(1, 1, 1, 1),) + # we expect to no longer raise here + torch.compile(fn, fullgraph=True)(*example_inputs) + def test_dynamic_float_scalar_tensor_coersion(self): # Minified version of https://github.com/pytorch/pytorch/issues/158376#issuecomment-3079591367 class Foo: diff --git a/test/dynamo/test_precompile_context.py b/test/dynamo/test_precompile_context.py index b509adf28112..01c71968f160 100644 --- a/test/dynamo/test_precompile_context.py +++ b/test/dynamo/test_precompile_context.py @@ -47,7 +47,8 @@ def simple_function(x): x = 
torch.randn(10, device=GPU_TYPE, requires_grad=True) result = compiled_fn(x) result.sum().backward() - self.assertEqual(len(PrecompileContext._new_cache_artifacts_by_key), 2) + self.assertEqual(len(PrecompileContext._dynamo_cache_entries), 1) + self.assertEqual(len(PrecompileContext._backend_artifacts_by_key), 1) self.assertEqual(len(PrecompileContext._new_cache_artifacts), 0) result = PrecompileContext.serialize() @@ -82,8 +83,9 @@ def simple_function(x): x = torch.randn(10, device=GPU_TYPE, requires_grad=True) result = compiled_fn(x) result.sum().backward() - self.assertEqual(len(PrecompileContext._new_cache_artifacts_by_key), 2) - for key in PrecompileContext._new_cache_artifacts_by_key.keys(): + self.assertEqual(len(PrecompileContext._dynamo_cache_entries), 1) + self.assertEqual(len(PrecompileContext._backend_artifacts_by_key), 1) + for key in PrecompileContext._backend_artifacts_by_key.keys(): result = PrecompileContext.serialize_artifact_by_key(key) assert isinstance(result, PrecompileCacheArtifact) self.assertEqual(result.key, key) @@ -109,11 +111,12 @@ def simple_function(x): x = torch.randn(10, device=GPU_TYPE, requires_grad=True) result = compiled_fn(x) result.sum().backward() - self.assertEqual(len(PrecompileContext._new_cache_artifacts_by_key), 2) + self.assertEqual(len(PrecompileContext._dynamo_cache_entries), 1) + self.assertEqual(len(PrecompileContext._backend_artifacts_by_key), 1) # Find the key for the artifact of type "precompile_aot_autograd" key = next( k - for k, v in PrecompileContext._new_cache_artifacts_by_key.items() + for k, v in PrecompileContext._backend_artifacts_by_key.items() if isinstance(v, EditablePrecompileCacheArtifact) ) diff --git a/test/dynamo/test_structured_trace.py b/test/dynamo/test_structured_trace.py index 89c14961a3a7..ce4f97ad3c6a 100644 --- a/test/dynamo/test_structured_trace.py +++ b/test/dynamo/test_structured_trace.py @@ -109,6 +109,8 @@ def format(self, record): metadata["dynamo_start"]["stack"] = "STACK" if "inductor_output_code" in metadata: metadata["inductor_output_code"]["filename"] = "FILENAME" + if "file_path" in metadata["inductor_output_code"]: + metadata["inductor_output_code"]["file_path"] = "FILENAME" if "stack" in metadata: metadata["stack"] = "STACK" if "compilation_metrics" in metadata: @@ -259,7 +261,7 @@ def test_schedule(self): {"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} -{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} +{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "triton_kernel_info", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} @@ -293,7 +295,7 @@ def test_cudagraphs(self): {"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, 
"frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} -{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} +{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "triton_kernel_info", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} @@ -335,7 +337,7 @@ def fn(x, y): {"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} -{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} +{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 0} @@ -357,7 +359,7 @@ def fn(x, y): {"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"} -{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"} +{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"} {"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"} {"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 1, "attempt": 0} @@ -389,7 +391,7 @@ def test_example_fn(self): {"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": 
"after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} -{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} +{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 0} @@ -446,7 +448,7 @@ def test_example_training_fn(self): {"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"} {"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"} {"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"} -{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"} +{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"} {"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"} {"dynamo_cpp_guards_str": {}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"} {"compilation_metrics": "METRICS", "frame_id": 2, "frame_compile_id": 0, "attempt": 1} @@ -455,7 +457,7 @@ def test_example_training_fn(self): {"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"} {"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"} {"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"} -{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"} +{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"} {"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"} {"bwd_compilation_metrics": "METRICS", "frame_id": 2, "frame_compile_id": 0, "attempt": 1} {"dynamo_start": {"stack": "STACK"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0} @@ -598,7 +600,7 @@ def forward(self, x): {"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} -{"inductor_output_code": {"filename": "FILENAME"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, 
"has_payload": "HASH"} +{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "aotautograd_cache_bypass", "encoding": "json"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} @@ -611,7 +613,7 @@ def forward(self, x): {"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} -{"inductor_output_code": {"filename": "FILENAME"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} +{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "aotautograd_cache_bypass", "encoding": "json"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"dynamo_cpp_guards_str": {}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} @@ -679,7 +681,7 @@ def forward(self, x): {"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} -{"inductor_output_code": {"filename": "FILENAME"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} +{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"describe_storage": {"id": 16, "describer_id": "ID", "size": 4194304}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0} {"describe_tensor": {"id": 29, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1024, 1], "storage": 16, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0} @@ -698,7 +700,7 @@ def forward(self, x): {"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} 
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} -{"inductor_output_code": {"filename": "FILENAME"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} +{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"dynamo_cpp_guards_str": {}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"compilation_metrics": "METRICS", "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0} @@ -739,7 +741,7 @@ def fn(x): {"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} -{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} +{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"dynamo_cpp_guards_str": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"compilation_metrics": "METRICS", "frame_id": 1, "frame_compile_id": 0, "attempt": 0} @@ -900,7 +902,7 @@ def fn(a): {"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} -{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} +{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 0} @@ -915,7 +917,7 @@ def fn(a): {"aot_inference_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"inductor_post_grad_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} -{"inductor_output_code": {"filename": 
"FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} +{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"artifact": {"name": "inductor_provenance_tracking_node_mappings", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} {"artifact": {"name": "inductor_provenance_tracking_kernel_stack_traces", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} {"artifact": {"name": "fx_graph_cache_hit", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} diff --git a/test/export/test_export.py b/test/export/test_export.py index 7f03666b807f..2ffd3dbcf399 100755 --- a/test/export/test_export.py +++ b/test/export/test_export.py @@ -1458,6 +1458,40 @@ def forward(self, x, y): ep = export(f, args, strict=False) self.assertEqual(ep.module()(*args), f(*args)) + def test_where_decomp(self): + class TestModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.ops.aten.where.default(x > 0) + + test_module = TestModule() + sample_input = (torch.randn(2, 3),) + + def auto_dynamic_shapes_from_args(args): # pyre-ignore + """ + This function creates dynamic shapes specification with Dim.AUTO + in all dimensions of all tensors for given argument list. + """ + if isinstance(args, list): + return [auto_dynamic_shapes_from_args(arg) for arg in args] + elif isinstance(args, tuple): + return tuple(auto_dynamic_shapes_from_args(arg) for arg in args) + elif isinstance(args, dict): + return {k: auto_dynamic_shapes_from_args(v) for k, v in args.items()} + elif isinstance(args, torch.Tensor): + return {j: Dim.AUTO for j in range(args.dim())} + else: + print(f"args type: {type(args)}") + return None + + ep = torch.export.export( + test_module, + sample_input, + dynamic_shapes=auto_dynamic_shapes_from_args(sample_input), + ).run_decompositions({}) + def test_basic_non_strict_fake_tensor(self): class Basic(torch.nn.Module): def __init__(self) -> None: diff --git a/test/functorch/test_control_flow.py b/test/functorch/test_control_flow.py index 61658692612b..e103aeb97c91 100644 --- a/test/functorch/test_control_flow.py +++ b/test/functorch/test_control_flow.py @@ -2007,7 +2007,6 @@ def test_scan_complex_pytree(self, reverse, compile_mode, device, autograd): # Fails with: AssertionError: scan is not an OpOverload @unittest.skipIf(not SM70OrLater, "triton") @requires_cuda - @unittest.expectedFailure def test_scan_associative_scan(self): combine_mode = "generic" compile_mode_scan = "compile" diff --git a/test/inductor/test_aot_inductor.py b/test/inductor/test_aot_inductor.py index 82dc013963f3..7bf1dc7d36ca 100644 --- a/test/inductor/test_aot_inductor.py +++ b/test/inductor/test_aot_inductor.py @@ -50,6 +50,7 @@ ) from torch.testing._internal.common_device_type import ( _has_sufficient_memory, + e4m3_type, skipCUDAIf, ) from torch.testing._internal.common_quantization import ( @@ -1194,7 +1195,6 @@ def forward(self, x, y): not PLATFORM_SUPPORTS_FP8, "FP8 is only supported on H100+, SM 8.9 and MI300+ devices", ) - @skipIfRocm # _scaled_mm_out_cuda is not compiled for ROCm platform @skipIfXpu def test_fp8(self): # cuda only @@ -1207,7 +1207,7 @@ def __init__(self, dtype): self.out_dtype = dtype def forward(self, x, weight, bias, scale_a, scale_b): - weight = weight.to(torch.float8_e4m3fn) + weight = weight.to(e4m3_type) output = torch._scaled_mm( x, weight, @@ -1229,7 
+1229,7 @@ def forward(self, x, weight, bias, scale_a, scale_b): b_inverse_scale = 1 / b_scale x_shape = (16, 16) - x = torch.rand(*x_shape, device=GPU_TYPE, dtype=dtype).to(torch.float8_e4m3fn) + x = torch.rand(*x_shape, device=GPU_TYPE, dtype=dtype).to(e4m3_type) dim0_x = Dim("dim0_x", min=1, max=2048) dynamic_shapes = ({0: dim0_x}, None, None, None, None) self.check_model( @@ -1242,7 +1242,6 @@ def forward(self, x, weight, bias, scale_a, scale_b): not PLATFORM_SUPPORTS_FP8, "FP8 is only supported on H100+, SM 8.9 and MI300+ devices", ) - @skipIfRocm # _scaled_mm_out_cuda is not compiled for ROCm platform @skipIfXpu def test_fp8_view_of_param(self): # cuda only @@ -1277,15 +1276,13 @@ def forward(self, x, bias, scale_a, scale_b): input_bias = torch.rand(32, device=self.device, dtype=dtype) weight_shape = (32, 16) weight = torch.rand(*weight_shape, device=self.device, dtype=dtype).to( - torch.float8_e4m3fn + e4m3_type ) a_inverse_scale = 1 / a_scale b_inverse_scale = 1 / b_scale x_shape = (16, 16) - x = torch.rand(*x_shape, device=self.device, dtype=dtype).to( - torch.float8_e4m3fn - ) + x = torch.rand(*x_shape, device=self.device, dtype=dtype).to(e4m3_type) dim0_x = Dim("dim0_x", min=1, max=2048) dynamic_shapes = ({0: dim0_x}, None, None, None) self.check_model( @@ -5195,7 +5192,6 @@ def forward(self, x): not PLATFORM_SUPPORTS_FP8, "FP8 is only supported on H100+, SM 8.9 and MI300+ devices", ) - @skipIfRocm # _scaled_mm_out_cuda is not compiled for ROCm platform @skipIfXpu def test_aoti_debug_printer_fp8_dtype(self): if self.device != GPU_TYPE: @@ -5207,7 +5203,7 @@ def __init__(self, dtype): self.out_dtype = dtype def forward(self, x, weight, bias, scale_a, scale_b): - weight = weight.to(torch.float8_e4m3fn) + weight = weight.to(e4m3_type) output = torch._scaled_mm( x, weight, @@ -5229,7 +5225,7 @@ def forward(self, x, weight, bias, scale_a, scale_b): b_inverse_scale = 1 / b_scale x_shape = (16, 16) - x = torch.rand(*x_shape, device=GPU_TYPE, dtype=dtype).to(torch.float8_e4m3fn) + x = torch.rand(*x_shape, device=GPU_TYPE, dtype=dtype).to(e4m3_type) kernel_calls = [ (f"aoti_torch_{GPU_TYPE}__scaled_mm_out", 5), diff --git a/test/inductor/test_codecache.py b/test/inductor/test_codecache.py index 6da49ab39229..09570b98a2fb 100644 --- a/test/inductor/test_codecache.py +++ b/test/inductor/test_codecache.py @@ -7,6 +7,7 @@ import subprocess import sys import tempfile +import textwrap import unittest from contextlib import contextmanager from typing import Optional, Union @@ -56,6 +57,7 @@ from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, IS_FBCODE, + IS_SANDCASTLE, parametrize, TEST_WITH_ROCM, ) @@ -138,6 +140,101 @@ def test_linemaps_empty(self): stack_frames = PyCodeCache.stack_frames_for_code(path, 0) self.assertEqual(stack_frames, None) + @unittest.skipIf(IS_FBCODE or IS_SANDCASTLE, "Skip in fbcode/sandcastle") + def test_editable_cached_wrapper(self): + with tempfile.TemporaryDirectory() as tmpdir: + env = os.environ.copy() + env["TORCHINDUCTOR_CACHE_DIR"] = tmpdir + + step1 = textwrap.dedent( + """ + import glob + import os + import torch + import warnings + from torch._inductor import config + + warnings.filterwarnings("ignore") + config.fx_graph_cache = True + config.fx_graph_remote_cache = False + torch._dynamo.reset() + + @torch.compile(backend="inductor") + def f(x): + return x * 2 + + f(torch.ones(2)) + cache_dir = os.environ["TORCHINDUCTOR_CACHE_DIR"] + pyfiles = glob.glob(os.path.join(cache_dir, "**", "*.py"), recursive=True) + print(pyfiles[0]) 
+ """ + ) + wrapper_path = ( + subprocess.check_output([sys.executable, "-c", step1], env=env) + .decode() + .strip() + ) + + step2 = textwrap.dedent( + """ + import torch + import warnings + from torch._dynamo.utils import counters + from torch._inductor import config + + warnings.filterwarnings("ignore") + config.fx_graph_cache = True + config.fx_graph_remote_cache = False + torch._dynamo.reset() + + @torch.compile(backend="inductor") + def f(x): + return x * 2 + + f(torch.ones(2)) + print(counters["inductor"]["fxgraph_cache_hit"]) + """ + ) + hit = ( + subprocess.check_output([sys.executable, "-c", step2], env=env) + .decode() + .strip() + ) + self.assertEqual(hit, "1") + + with open(wrapper_path) as f: + src = f.read() + with open(wrapper_path, "w") as f: + f.write( + src.replace( + "def call(self, args):", + "def call(self, args):\n print('debug')", + ) + ) + + step3 = textwrap.dedent( + """ + import torch + import warnings + from torch._inductor import config + + warnings.filterwarnings("ignore") + config.fx_graph_cache = True + config.fx_graph_remote_cache = False + torch._dynamo.reset() + + @torch.compile(backend="inductor") + def f(x): + return x * 2 + + f(torch.ones(2)) + """ + ) + out = subprocess.check_output( + [sys.executable, "-c", step3], env=env + ).decode() + self.assertIn("debug", out) + @instantiate_parametrized_tests class TestFxGraphCache(TestCase): diff --git a/test/inductor/test_cuda_select_algorithm.py b/test/inductor/test_cuda_select_algorithm.py index 271532d25ee0..1fc40c42ba19 100644 --- a/test/inductor/test_cuda_select_algorithm.py +++ b/test/inductor/test_cuda_select_algorithm.py @@ -124,6 +124,79 @@ def forward(self, x, scale): self.common(mod, (x, w_scales)) self.assertEqual(counters["inductor"]["woq_matcher_count"], 1) + @inductor_config.patch({"freezing": True, "cpp.enable_concat_linear": True}) + @patches + @torch.no_grad + @dtypes(torch.bfloat16) + @parametrize("batch_size", (1, 32)) + @parametrize("mid_dim", (1, 8)) + @parametrize("in_features", (128,)) + @parametrize("out_features", (64,)) + @unittest.skipIf(not TEST_CUDA, "CUDA not available") + def test_int8_woq_mm_concat_cuda( + self, dtype, batch_size, mid_dim, in_features, out_features + ): + def _convert_weight_to_int8pack(w): + # Move to CPU for quantization calculation, then back to original device + device = w.device + w_cpu = w.cpu() + scale, zp = _calculate_dynamic_per_channel_qparams( + w_cpu.to(torch.float), torch.int8 + ) + scale = torch.from_numpy(scale).to(device) + zp = torch.from_numpy(zp).to(device) + w_int8 = torch.ao.quantization.fx._decomposed.quantize_per_channel( + input=w, + scales=scale, + zero_points=zp, + axis=0, + quant_min=-128, + quant_max=127, + dtype=torch.int8, + ) + return w_int8, scale.to(torch.bfloat16) + + class M(torch.nn.Module): + def __init__(self, w1, w2, w3): + super().__init__() + self.w1 = torch.nn.Parameter(w1, requires_grad=False) + self.w2 = torch.nn.Parameter(w2, requires_grad=False) + self.w3 = torch.nn.Parameter(w3, requires_grad=False) + + def forward(self, x, scale1, scale2, scale3): + # Ref: _linear_fp_act_int8_weight_impl in torchao/dtypes/uintx/plain_layout.py + y1 = ( + torch.mm(x.reshape(-1, x.shape[-1]), self.w1.t().to(x.dtype)) + * scale1 + ) + y2 = ( + torch.mm(x.reshape(-1, x.shape[-1]), self.w2.t().to(x.dtype)) + * scale2 + ) + y3 = ( + torch.mm(x.reshape(-1, x.shape[-1]), self.w3.t().to(x.dtype)) + * scale3 + ) + return ( + y1.reshape(*x.shape[:-1], y1.shape[-1]), + y2.reshape(*x.shape[:-1], y2.shape[-1]), + y3.reshape(*x.shape[:-1], 
y3.shape[-1]), + ) + + counters.clear() + # Currently, the corresponding torch.fx pattern only supports 3D x + # Add 2D X case once the corresponding pattern-matcher pattern is added + x = torch.rand((batch_size, mid_dim, in_features), dtype=dtype, device="cuda") + w1 = torch.rand((out_features, in_features), dtype=dtype, device="cuda") + w2 = torch.rand((out_features, in_features), dtype=dtype, device="cuda") + w3 = torch.rand((out_features, in_features), dtype=dtype, device="cuda") + w1_int8pack, w1_scales = _convert_weight_to_int8pack(w1) + w2_int8pack, w2_scales = _convert_weight_to_int8pack(w2) + w3_int8pack, w3_scales = _convert_weight_to_int8pack(w3) + mod = M(w1_int8pack, w2_int8pack, w3_int8pack).eval() + self.common(mod, (x, w1_scales, w2_scales, w3_scales)) + self.assertEqual(counters["inductor"]["woq_matcher_count"], 3) + instantiate_device_type_tests(TestSelectAlgorithmCuda, globals(), only_for="cuda") diff --git a/test/inductor/test_cutlass_backend.py b/test/inductor/test_cutlass_backend.py index d0618886660a..149e32a2433a 100644 --- a/test/inductor/test_cutlass_backend.py +++ b/test/inductor/test_cutlass_backend.py @@ -257,7 +257,7 @@ def test_import_cutlass(self): if config.is_fbcode(): import python_cutlass else: - import cutlass as python_cutlass # noqa: F401 + import cutlass_cppgen as python_cutlass # noqa: F401 import cutlass_library # noqa: F401 def test_cutlass_key(self): diff --git a/test/inductor/test_cutlass_evt.py b/test/inductor/test_cutlass_evt.py index cae9558d2ec2..66f03762fa1b 100644 --- a/test/inductor/test_cutlass_evt.py +++ b/test/inductor/test_cutlass_evt.py @@ -36,7 +36,7 @@ if config.is_fbcode(): import python_cutlass # type: ignore[import-untyped, import-not-found] # noqa: F401 else: - import cutlass as python_cutlass # type: ignore[import-untyped, import-not-found] # noqa: F401 + import cutlass_cppgen as python_cutlass # type: ignore[import-untyped, import-not-found] # noqa: F401 CutlassTensor = python_cutlass.backend.evt.ir.tensor.Tensor BIAS_CODE = """def example_epilogue(accum, C, aux, bias): diff --git a/test/inductor/test_flex_decoding.py b/test/inductor/test_flex_decoding.py index 120d8d36b439..849aefff8a96 100644 --- a/test/inductor/test_flex_decoding.py +++ b/test/inductor/test_flex_decoding.py @@ -43,9 +43,6 @@ Tolerances = namedtuple("Tolerances", ["atol", "rtol"]) -# In MI300, HIPBLASLT_ALLOW_TF32=1 is used to enable tf32 for matmul. -# In the current test, HIPBLASLT_ALLOW_TF32 is not set, according to the -# logic of allowTF32CuBLAS(), set float32_matmul_precision to highest. 
if torch.version.hip: torch.set_float32_matmul_precision("highest") else: diff --git a/test/inductor/test_memory.py b/test/inductor/test_memory.py index f905990478f7..bf994b5e6b84 100644 --- a/test/inductor/test_memory.py +++ b/test/inductor/test_memory.py @@ -408,13 +408,9 @@ def can_fuse( code = run_and_get_triton_code(f_compiled, x, y, z) ( FileCheck() - .check("triton_poi_fused_add_0.run(buf1, arg2_1,") - .check("triton_poi_fused_add_0.run(buf3, arg2_1,") - .check("triton_poi_fused_add_0.run(buf4, buf3,") - .check("triton_poi_fused_add_0.run(buf6, arg2_1,") - .check("triton_poi_fused_add_0.run(buf7, buf6,") - .check("triton_poi_fused_add_0.run(buf9, arg2_1,") - .check("triton_poi_fused_add_0.run(buf10, buf9,") + .check("triton_poi_fused_add_0.run(buf2, arg2_1, buf1,") + .check("triton_poi_fused_add_1.run(buf4, buf3, arg2_1") + .check("triton_poi_fused_add_1.run(buf6, buf5, arg2_1,") .run(code) ) diff --git a/test/inductor/test_padding.py b/test/inductor/test_padding.py index 9ef3a18e2423..c67bde87a369 100644 --- a/test/inductor/test_padding.py +++ b/test/inductor/test_padding.py @@ -109,9 +109,6 @@ def setUpClass(cls): if HAS_GPU: cls.prior_float32_matmul_precision = torch.get_float32_matmul_precision() cls.prior_default_device = torch.get_default_device() - # In MI300, HIPBLASLT_ALLOW_TF32=1 is used to enable tf32 for matmul. - # In the current test, HIPBLASLT_ALLOW_TF32 is not set, according to the - # logic of allowTF32CuBLAS(), set float32_matmul_precision to highest. if torch.version.hip: torch.set_float32_matmul_precision("highest") else: diff --git a/test/inductor/test_unbacked_symints.py b/test/inductor/test_unbacked_symints.py index cc9c1251523d..b41886a03dd9 100644 --- a/test/inductor/test_unbacked_symints.py +++ b/test/inductor/test_unbacked_symints.py @@ -489,6 +489,22 @@ def fn(q, k, vector, scalar): expected = fn(*example_inputs) torch.testing.assert_close(actual, expected) + @skipGPUIf(not HAS_GPU, "requires gpu and triton") + @dynamo_config.patch({"capture_dynamic_output_shape_ops": True}) + def test_softmax(self, device): + def fn(x): + nz = x.nonzero().float() + soft = torch.softmax(nz, dim=0) + logsoft = torch.nn.functional.log_softmax(nz, dim=0) + return soft * logsoft + + example_inputs = ( + torch.randint(low=0, high=2, size=(32,), device=device, dtype=torch.int8), + ) + actual = torch.compile(fn, fullgraph=True)(*example_inputs) + expected = fn(*example_inputs) + torch.testing.assert_close(actual, expected) + @skipGPUIf(not HAS_GPU, "requires gpu and triton") @skipIfXpu(msg="_scaled_dot_product_flash_attention is not supported on XPU yet") @dynamo_config.patch({"capture_dynamic_output_shape_ops": True}) diff --git a/test/inductor/test_utils.py b/test/inductor/test_utils.py index 0fb1a8dcf322..9300bf1bae8c 100644 --- a/test/inductor/test_utils.py +++ b/test/inductor/test_utils.py @@ -131,7 +131,7 @@ def create_fx_node( ( torch.ops.aten.convolution, ( - torch.Tensor(2, 3, 3), + torch.Tensor(2, 2, 3), torch.Tensor(2, 2, 2), torch.Tensor(2), (1, 1), diff --git a/test/test_ao_sparsity.py b/test/test_ao_sparsity.py index 5ae5a0874318..35b96522a81c 100644 --- a/test/test_ao_sparsity.py +++ b/test/test_ao_sparsity.py @@ -1,4 +1,4 @@ -# Owner(s): ["module: unknown"] +# Owner(s): ["module: sparse"] import logging # Kernels diff --git a/test/test_cuda.py b/test/test_cuda.py index 7bd310042862..b809fc521600 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -759,53 +759,7 @@ def check_workspace_size(inp): torch._C._cuda_clearCublasWorkspaces() - 
@contextlib.contextmanager - def _hip_allow_tf32(self): - # for HIP/AMDGPU, tf32 is behind a flag because the TF32 support is new - # and only for MI300+ - hip_allow_tf32 = os.environ.get("HIPBLASLT_ALLOW_TF32", None) - os.environ["HIPBLASLT_ALLOW_TF32"] = "1" - - try: - yield - finally: - if hip_allow_tf32 is not None: - os.environ["HIPBLASLT_ALLOW_TF32"] = hip_allow_tf32 - else: - del os.environ["HIPBLASLT_ALLOW_TF32"] - - @unittest.skipIf(not TEST_WITH_ROCM, "not relevant for CUDA testing") - def test_hipblaslt_allow_tf32(self): - tf32_ctx = self._hip_allow_tf32 - with tf32_ctx(): - os.environ["HIPBLASLT_ALLOW_TF32"] = "0" - # Save original value of allow_tf32 - orig = torch.backends.cuda.matmul.allow_tf32 - # If allow_tf32 variable is declared as static in aten/src/ATen/Context.cpp - # then matmul.allow_tf32 will return False after this point even if - # HIP_BLASLT_ALLOW_TF32 is set to 1 and matmul.allow_tf32 is changed. - os.environ["HIPBLASLT_ALLOW_TF32"] = "1" - # Toggle torch.backends.cuda.matmul.allow_tf32 couple of times. - torch.backends.cuda.matmul.allow_tf32 = not orig - test1 = torch.backends.cuda.matmul.allow_tf32 - torch.backends.cuda.matmul.allow_tf32 = orig - test2 = torch.backends.cuda.matmul.allow_tf32 - self.assertNotEqual(test1, test2) - # Restore original value of allow_tf32 - torch.backends.cuda.matmul.allow_tf32 = orig - def test_cublas_allow_tf32_get_set(self): - """ - We only turn on TF32 for MI300 with a special env var. This is because TF32 - is only available in MI300+ and is in experimental mode (hipblaslt support - is current WIP) - """ - tf32_ctx = self._hip_allow_tf32 if torch.version.hip else contextlib.nullcontext - - with tf32_ctx(): - self._test_cublas_allow_tf32_get_set_inner() - - def _test_cublas_allow_tf32_get_set_inner(self): skip_tf32_cublas = "TORCH_ALLOW_TF32_CUBLAS_OVERRIDE" in os.environ and int( os.environ["TORCH_ALLOW_TF32_CUBLAS_OVERRIDE"] ) @@ -820,12 +774,6 @@ def _test_cublas_allow_tf32_get_set_inner(self): torch.backends.cuda.matmul.allow_tf32 = orig def test_float32_matmul_precision_get_set(self): - tf32_ctx = self._hip_allow_tf32 if torch.version.hip else contextlib.nullcontext - - with tf32_ctx(): - self._test_float32_matmul_precision_get_set_inner() - - def _test_float32_matmul_precision_get_set_inner(self): orig = torch.get_float32_matmul_precision() skip_tf32_cublas = "TORCH_ALLOW_TF32_CUBLAS_OVERRIDE" in os.environ and int( os.environ["TORCH_ALLOW_TF32_CUBLAS_OVERRIDE"] diff --git a/test/test_dynamic_shapes.py b/test/test_dynamic_shapes.py index 0e90587822d6..3fee860a7980 100644 --- a/test/test_dynamic_shapes.py +++ b/test/test_dynamic_shapes.py @@ -1818,6 +1818,96 @@ def test_stride_symnode(self): self.assertTrue(isinstance(s3, int)) self.assertTrue(str(s1.node.expr) != str(s2.node.expr)) + @fresh_cache() + @torch._dynamo.config.patch("capture_scalar_outputs", True) + @parametrize("backend", ["inductor", "eager"]) + def test_dynamic_int_basic_compile(self, backend): + from torch.fx.experimental.sym_node import DynamicInt + + cnt = CompileCounterWithBackend(backend) + + # test scalar inputs to function + def f(x, y, z): + out = torch.tensor([x + y + z]) + out = out + torch.zeros(abs(x) + 2).sum() # test out tensor construction + return out + + fn = torch.compile(f, fullgraph=True, backend=cnt) + x = DynamicInt(1) + z = DynamicInt(3) + self.assertEqual(fn(x, x, z), f(1, 1, 3)) # guard: x == y + self.assertEqual(fn(2, 2, 0), f(2, 2, 0)) + self.assertEqual(fn(-1, -1, 2), f(-1, -1, 2)) + self.assertEqual(cnt.frame_count, 1) # no 
recompiles + + self.assertEqual(fn(3, 4, 5), f(3, 4, 5)) # now we recompile + self.assertEqual(cnt.frame_count, 2) + + # test nn module property + class Foo(torch.nn.Module): + def __init__(self): + super().__init__() + self.i = DynamicInt(1) + + def forward(self, x): + return torch.tensor([x + self.i]) + + cnt.clear() + m = Foo() + mc = torch.compile(m, backend=cnt, fullgraph=True) + + self.assertEqual(mc(DynamicInt(0)), m(0)) + mc.i = -2 # override attribute + self.assertEqual(mc(-1), m(-1)) + self.assertEqual(cnt.frame_count, 1) + + def test_dynamic_int_eager_usage(self): + from torch.fx.experimental.sym_node import DynamicInt + + w = DynamicInt(-1) + x = DynamicInt(0) + y = DynamicInt(1) + z = DynamicInt(2) + + def check(l, r): + self.assertTrue(isinstance(l, DynamicInt)) + self.assertEqual(l, r) + + # test arithmetic + check(2 * y + z, 4) + check((10 - z) // 2, 4) + check(1 // z, 0) + check(-w + w**2, 2) + check(x % z, 0) + check(1 << z, 4) + check(z | y, 3) + check(min(y, z), 1) + self.assertTrue(z > -2) + with self.assertRaises(ZeroDivisionError): + y % x + + # math, numpy + self.assertEqual(math.cos(x), y) + self.assertEqual(math.prod([z, z], start=z), 8) + self.assertEqual(np.arange(z)[y], 1) + self.assertTrue(np.allclose(np.ones([y, z]).sum(axis=x), np.ones(z))) + + # test conversions + self.assertTrue(isinstance(x + 2, int)) + self.assertTrue(isinstance(x + 2, DynamicInt)) + self.assertEqual(y / 2.0, 0.5) # this could return DynamicFloat in future + self.assertEqual(float(z), 2.0) + self.assertFalse(bool(x)) + self.assertEqual(DynamicInt(x).real, x.real) + + # torch functions, scalar inputs + self.assertEqual(torch.arange(z)[:w][x], 0) + self.assertEqual(torch.add(torch.tensor(w), torch.tensor(w), alpha=z), -3) + self.assertEqual( + list(torch.nn.Linear(z, y)(torch.randn(z * 2, z)).shape), [4, 1] + ) + self.assertEqual(z * torch.ones(z).sum(dim=x), 4) + instantiate_parametrized_tests(TestSymNumberMagicMethods) diff --git a/test/test_linalg.py b/test/test_linalg.py index ffae8ac18da2..4f8780dfc30a 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -109,22 +109,6 @@ def get_tunableop_untuned_filename(): return untuned_filename class TestLinalg(TestCase): - @contextlib.contextmanager - def _hip_allow_tf32(self): - # for HIP/AMDGPU, tf32 is behind a flag because the TF32 support is new - # and only for MI300+. Environment variable will be removed in the future. - import os - hip_allow_tf32 = os.environ.get("HIPBLASLT_ALLOW_TF32", None) - os.environ["HIPBLASLT_ALLOW_TF32"] = "1" - - try: - yield - finally: - if hip_allow_tf32 is not None: - os.environ["HIPBLASLT_ALLOW_TF32"] = hip_allow_tf32 - else: - del os.environ["HIPBLASLT_ALLOW_TF32"] - def setUp(self): super().setUp() torch.backends.cuda.matmul.allow_tf32 = False @@ -5542,13 +5526,8 @@ def test_scaled_gemm_tunableop(self, device, dtype): @runOnRocmArch(MI300_ARCH) @dtypes(torch.float) def test_tf32_tunableop(self, device, dtype): - # Test TunableOp with TF32. Supported by hipblasLT on MI300+. - # for HIP/AMDGPU, tf32 is behind a flag because the TF32 support is new - # and only for MI300+. Eventually this flag will go away. 
- tf32_ctx = self._hip_allow_tf32 if torch.version.hip else contextlib.nullcontext - try: - with self._tunableop_ctx(), tf32_ctx(): + with self._tunableop_ctx(): torch.backends.cuda.matmul.allow_tf32 = True torch.cuda.tunable.set_rotating_buffer_size(0) @@ -5611,13 +5590,8 @@ def test_tf32_offline_tunableop(self, device, dtype): # This test is the offline version of test_tf32_tunableop import os - # Test TunableOp with TF32. Supported by hipblasLT on MI300+. - # for HIP/AMDGPU, tf32 is behind a flag because the TF32 support is new - # and only for MI300+. Eventually this flag will go away. - tf32_ctx = self._hip_allow_tf32 if torch.version.hip else contextlib.nullcontext - try: - with self._tunableop_ctx(), tf32_ctx(): + with self._tunableop_ctx(): torch.backends.cuda.matmul.allow_tf32 = True ordinal = torch.cuda.current_device() torch.cuda.tunable.set_rotating_buffer_size(0) diff --git a/test/test_ops.py b/test/test_ops.py index 2d5af9966690..64b657c9294b 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -190,7 +190,6 @@ def reduction_dtype_filter(op): xfail("tril"), xfail("triu"), xfail("unfold_copy"), - xfail("where"), # Output has dynamic shape. # Does not have a meta kernel implementation. skip("linalg.lstsq"), diff --git a/test/test_sparse_semi_structured.py b/test/test_sparse_semi_structured.py index 51fb4aa48c22..ce1d62f48398 100644 --- a/test/test_sparse_semi_structured.py +++ b/test/test_sparse_semi_structured.py @@ -50,8 +50,8 @@ _IS_HIPSPARSELT_AVAILABLE = False if torch.cuda.is_available(): - _IS_SM8X = torch.cuda.get_device_capability(0)[0] == 8 - _IS_SM9X = torch.cuda.get_device_capability(0)[0] == 9 + _IS_SM8X = torch.version.cuda is not None and (torch.cuda.get_device_capability(0)[0] == 8) + _IS_SM9X = torch.version.cuda is not None and (torch.cuda.get_device_capability(0)[0] == 9) _IS_HIPSPARSELT_AVAILABLE = torch.version.hip is not None and tuple(int(v) for v in torch.version.hip.split('.')[:2]) > (6, 4) # CUTLASS kernels only work for Ampere if _IS_SM8X: diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index 15c04b8154c3..03a7e381332a 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -1151,6 +1151,50 @@ def test_cat2(self, device, dtype): z = torch.cat([x, y]) self.assertEqual(z.size(), (21, SIZE, SIZE)) + @dtypes(torch.float) + def test_cat_size1(self, device, dtype): + # create a tensor that has aligned stride along dim - 1 dimension + # but catted slice size is not aligned + x1 = torch.randn(16, 16, device=device, dtype=dtype)[:1, :1] + xref = x1.clone().view(-1).view(x1.shape) + # make sure output size is aligned, need at least 4 elements for this + res = torch.cat([x1, x1, x1, x1], dim=-1) + ref = torch.cat([xref, xref, xref, xref], dim=-1) + self.assertEqual(res, ref) + + @dtypes(torch.float) + def test_cat_trailing_dim(self, device, dtype): + x1 = torch.randn(16, 16, 23, device=device, dtype=dtype) + x2 = torch.rand_like(x1) + res = torch.cat([x1, x2], dim=1) + ref = torch.cat([x1.cpu(), x2.cpu()], dim=1) + self.assertEqual(res, ref) + + @dtypes(torch.float) + def test_cat_misaligned(self, device, dtype): + x1 = torch.randn(14, device=device, dtype=dtype)[2:] + x2 = torch.rand_like(x1) + res = torch.cat([x1, x2], dim=-1) + ref = torch.cat([x1.cpu(), x2.cpu()], dim=-1) + self.assertEqual(res, ref) + + @dtypes(torch.float) + def test_cat_multi_batch(self, device, dtype): + xs = [torch.randn(16, 16, device=device, dtype=dtype) for _ in range(130)] + xs_cpu = [x.cpu() for x in xs] + res = 
torch.cat(xs, dim=-1) + ref = torch.cat(xs_cpu, dim=-1) + self.assertEqual(res, ref) + + @dtypes(torch.float) + @largeTensorTest("16GB") + def test_cat_large_tensor(self, device, dtype): + N = 2 ** 32 // dtype.itemsize + inps = [torch.randn(N, device=device, dtype=dtype), torch.randn(N // 128, device=device, dtype=dtype)] + res = torch.cat(inps, dim=0) + ref = torch.cat([x.cpu() for x in inps]) + self.assertEqual(res, ref) + # FIXME: Create an OpInfo-based tensor creation method test that verifies this for all tensor # creation methods and verify all dtypes and layouts @dtypes(torch.bool, torch.uint8, torch.int16, torch.int64, torch.float16, torch.float32, torch.complex64) diff --git a/test/test_transformers.py b/test/test_transformers.py index c58fe05d37be..7c2060034710 100644 --- a/test/test_transformers.py +++ b/test/test_transformers.py @@ -51,7 +51,6 @@ PLATFORM_SUPPORTS_CUDNN_ATTENTION, tf32_on_and_off, tf32_enabled, - ROCM_VERSION, ) if TEST_FAIRSEQ: @@ -340,7 +339,7 @@ def test_train_with_pad_and_catch_error(self, device): l1_bool = nn.L1Loss()(test_train_bool[:, 0:2, :], test_eval_bool[:, 0:2, :]).item() self.assertTrue(l1_bool < 1e-4, "Eval/Train difference in pad_mask BOOL") - @tf32_on_and_off(0.001, only_if=(not TEST_WITH_ROCM or ROCM_VERSION < (7, 0))) + @tf32_on_and_off(0.001) @parametrize("attn_mask_dim", [2, 3, None]) @parametrize("key_padding_mask_dim", [2, None]) @parametrize("mask_dtype", [torch.bool, torch.float32]) @@ -524,7 +523,7 @@ def test_transformerencoder_fastpath(self, device, use_torchscript, enable_neste slowpath_output = slowpath_output.masked_fill(src_key_padding_mask.unsqueeze(-1), 0) self.assertEqual(fastpath_output_expanded, slowpath_output) - @tf32_on_and_off(0.001, only_if=(not TEST_WITH_ROCM or ROCM_VERSION < (7, 0))) + @tf32_on_and_off(0.001) @parametrize("with_no_grad", [True, False]) @parametrize("training", [True, False]) @parametrize("enable_nested_tensor", [False]) @@ -1110,7 +1109,7 @@ def forward( return_all_hiddens=False, )[0] - @tf32_on_and_off(0.003, only_if=(not TEST_WITH_ROCM or ROCM_VERSION < (7, 0))) + @tf32_on_and_off(0.003) @parametrize("input_dim,attn_mask_dim,is_causal", [(3, None, False), (3, 2, False), (3, 2, True), (3, 3, False), (3, 3, True), (4, None, False), (4, 2, False), (4, 2, True), (4, 4, False), (4, 4, True)], diff --git a/test/test_utils.py b/test/test_utils.py index 0314da6e320a..7c9e4c1d334f 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -3,7 +3,6 @@ import os import random -import re import shutil import subprocess import sys @@ -633,151 +632,6 @@ def test_multi_drop(self): test_dir = os.path.abspath(os.path.dirname(str(__file__))) -@unittest.skipIf( - "SKIP_TEST_BOTTLENECK" in os.environ.keys(), "SKIP_TEST_BOTTLENECK is set" -) -class TestBottleneck(TestCase): - def _run(self, command, timeout=30): - """Returns (return-code, stdout, stderr)""" - import subprocess - - p = subprocess.Popen( - command, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True, - ) - try: - output, err = p.communicate(timeout=timeout) - except subprocess.TimeoutExpired: - p.kill() - output, err = p.communicate() - rc = p.returncode - output_str = output.decode("ascii") - err_str = err.decode("ascii") - return (rc, output_str, err_str) - - def _run_bottleneck(self, test_file, scriptargs=""): - curdir = os.path.dirname(os.path.abspath(__file__)) - filepath = f"{curdir}/{test_file}" - if scriptargs != "": - scriptargs = f" {scriptargs}" - rc, out, err = self._run( - f"{sys.executable} -m torch.utils.bottleneck 
{filepath}{scriptargs}" - ) - return rc, out, err - - def _check_run_args(self): - # Check that this fails due to missing args - rc, out, err = self._run_bottleneck("bottleneck_test/test_args.py") - self.assertEqual( - rc, - 2, - atol=0, - rtol=0, - msg=self._fail_msg("Missing args should error", out + err), - ) - - # This should succeed - rc, out, err = self._run_bottleneck( - "bottleneck_test/test_args.py", "--foo foo --bar bar" - ) - self.assertEqual( - rc, - 0, - atol=0, - rtol=0, - msg=self._fail_msg("Should pass args to script", out + err), - ) - - def _fail_msg(self, msg, output): - return f"{msg}, output was:\n{output}" - - def _check_environment_summary(self, output): - results = re.search("Environment Summary", output) - self.assertIsNotNone( - results, self._fail_msg("Should have Environment Summary", output) - ) - - # Up to five lines away from the heading, there should be the version number - results = re.search( - r"Environment Summary.*(\n.*){,5}\nPyTorch \d+\.\d+", output - ) - self.assertIsNotNone( - results, self._fail_msg("Should have PyTorch version", output) - ) - - def _check_cprof_summary(self, output): - results = re.search("cProfile output", output) - self.assertIsNotNone( - results, self._fail_msg("Should have cProfile output", output) - ) - - # This assumes that after the cProfile output section we have - # the autograd profiler output - results = re.search( - r"cProfile output.*(\n.*){6,50}\n.*autograd profiler output", output - ) - self.assertIsNotNone( - results, - self._fail_msg( - "Distance between cProfile and autograd prof out not in [6, 50] lines", - output, - ), - ) - - def _check_autograd_summary(self, output): - results = re.search("autograd profiler output", output) - self.assertIsNotNone( - results, self._fail_msg("Should have autograd profiler output", output) - ) - - # This assumes that after the autograd profiler output is the end of the - # output. 
- results = re.search(r"autograd profiler output.*(\n.*){6,100}", output) - self.assertIsNotNone( - results, - self._fail_msg( - "Distance between autograd prof output and end of output not in [6, 100] lines", - output, - ), - ) - - def _check_cuda(self, output): - if HAS_CUDA: - results = re.search("CUDA mode", output) - self.assertIsNotNone( - results, self._fail_msg("Should tell users CUDA", output) - ) - else: - results = re.search("CUDA mode", output) - self.assertIsNone( - results, self._fail_msg("Should not tell users about CUDA", output) - ) - - @unittest.skipIf(HAS_CUDA, "CPU-only test") - def test_bottleneck_cpu_only(self): - rc, out, err = self._run_bottleneck("bottleneck_test/test.py") - self.assertEqual(rc, 0, msg=f"Run failed with\n{err}") - - self._check_run_args() - self._check_environment_summary(out) - self._check_autograd_summary(out) - self._check_cprof_summary(out) - self._check_cuda(out) - - @unittest.skipIf(not HAS_CUDA, "No CUDA") - def test_bottleneck_cuda(self): - rc, out, err = self._run_bottleneck("bottleneck_test/test_cuda.py") - self.assertEqual(rc, 0, msg=f"Run failed with\n{err}") - - self._check_run_args() - self._check_environment_summary(out) - self._check_autograd_summary(out) - self._check_cprof_summary(out) - self._check_cuda(out) - - from torch.utils.collect_env import get_pretty_env_info diff --git a/third_party/cutlass b/third_party/cutlass index e51efbfe18fe..57e3cfb47a2d 160000 --- a/third_party/cutlass +++ b/third_party/cutlass @@ -1 +1 @@ -Subproject commit e51efbfe18fe4f4cbb66ab814c55bf4aa0185491 +Subproject commit 57e3cfb47a2d9e0d46eb6335c3dc411498efa198 diff --git a/third_party/fbgemm b/third_party/fbgemm index 4b39c551efe1..f9ccd0126207 160000 --- a/third_party/fbgemm +++ b/third_party/fbgemm @@ -1 +1 @@ -Subproject commit 4b39c551efe15e6bbade20565b0ceb2d8ce3352d +Subproject commit f9ccd0126207c149bf99877c5e863d8018208858 diff --git a/third_party/xpu.txt b/third_party/xpu.txt index ed84e6812d9b..f8e8503faf2c 100644 --- a/third_party/xpu.txt +++ b/third_party/xpu.txt @@ -1 +1 @@ -d8c3eefc297193cf9e0888a7d8ff32dc74da0793 +24fab67b6ecf7620d0cf776047a3056c5b518bab diff --git a/tools/setup_helpers/generate_linker_script.py b/tools/setup_helpers/generate_linker_script.py index e66fc197062a..b5a7a4ce7dec 100644 --- a/tools/setup_helpers/generate_linker_script.py +++ b/tools/setup_helpers/generate_linker_script.py @@ -1,5 +1,7 @@ +import argparse import os import subprocess +from pathlib import Path def gen_linker_script( @@ -28,6 +30,10 @@ def gen_linker_script( assert len(text_line_start) == 1, "The linker script has multiple text sections!" text_line_start = text_line_start[0] + # ensure that parent directory exists before writing + fout = Path(fout) + fout.parent.mkdir(parents=True, exist_ok=True) + with open(fout, "w") as f: for lineid, line in enumerate(linker_script_lines): if lineid == text_line_start + 2: @@ -36,3 +42,20 @@ def gen_linker_script( f.write(f" .text.{plines}\n") f.write(" )\n") f.write(f"{line}\n") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Generate linker file based on prioritized symbols. 
Used for link-time optimization.", + ) + parser.add_argument( + "--filein", + help="Path to prioritized_text.txt input file", + default=argparse.SUPPRESS, + ) + parser.add_argument( + "--fout", help="Output path for linker ld file", default=argparse.SUPPRESS + ) + # convert args to a dict to pass to gen_linker_script + kwargs = vars(parser.parse_args()) + gen_linker_script(**kwargs) diff --git a/tools/testing/discover_tests.py b/tools/testing/discover_tests.py index 25fcf07de937..13511b1ec129 100644 --- a/tools/testing/discover_tests.py +++ b/tools/testing/discover_tests.py @@ -73,7 +73,6 @@ def skip_test_p(name: str) -> bool: cpp_tests_dir=CPP_TESTS_DIR, blocklisted_patterns=[ "ao", - "bottleneck_test", "custom_backend", "custom_operator", "fx", # executed by test_fx.py diff --git a/torch/__init__.py b/torch/__init__.py index eac57306e63d..08dee0624350 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -244,7 +244,7 @@ def _load_dll_libraries() -> None: textwrap.dedent( """ Microsoft Visual C++ Redistributable is not installed, this may lead to the DLL load failure. - It can be downloaded at https://aka.ms/vs/16/release/vc_redist.x64.exe + It can be downloaded at https://aka.ms/vs/17/release/vc_redist.x64.exe """ ).strip() ) @@ -2507,7 +2507,8 @@ def compile( fullgraph (bool): If False (default), torch.compile attempts to discover compilable regions in the function that it will optimize. If True, then we require that the entire function be capturable into a single graph. If this is not possible (that is, if there are graph breaks), - then this will raise an error. + then this will raise an error. This also opts into unbacked semantics, notably it will turn on + capture_scalar_outputs and capture_dynamic_output_shape_ops on by default. dynamic (bool or None): Use dynamic shape tracing. When this is True, we will up-front attempt to generate a kernel that is as dynamic as possible to avoid recompilations when sizes change. This may not always work as some operations/optimizations will diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py index 2a00c57419da..637db6192b14 100644 --- a/torch/_decomp/decompositions.py +++ b/torch/_decomp/decompositions.py @@ -1173,6 +1173,8 @@ def native_dropout(input: Tensor, p: float, train: Optional[bool]): @register_decomposition(aten._softmax) @out_wrapper() def _softmax(x: Tensor, dim: int, half_to_float: bool): + from torch.fx.experimental.symbolic_shapes import guard_or_false + # eager softmax returns a contiguous tensor. Ensure that decomp also returns # a contiguous tensor. x = x.contiguous() @@ -1182,7 +1184,7 @@ def _softmax(x: Tensor, dim: int, half_to_float: bool): x, type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT ) x = x.to(computation_dtype) - if x.numel() == 0: + if guard_or_false(x.numel() == 0): unnormalized = torch.exp(x) else: x_max = torch.amax(x, dim, keepdim=True) @@ -1196,6 +1198,8 @@ def _softmax(x: Tensor, dim: int, half_to_float: bool): @register_decomposition(aten._log_softmax) @out_wrapper(exact_dtype=True) def _log_softmax(x: Tensor, dim: int, half_to_float: bool): + from torch.fx.experimental.symbolic_shapes import guard_or_false + # eager log_softmax returns a contiguous tensor. Ensure that decomp also # returns a contiguous tensor. 
x = x.contiguous() @@ -1205,7 +1209,7 @@ def _log_softmax(x: Tensor, dim: int, half_to_float: bool): x, type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT ) x = x.to(computation_dtype) - if x.numel() == 0: + if guard_or_false(x.numel() == 0): shifted = x else: x_max = torch.amax(x, dim, keepdim=True) diff --git a/torch/_dynamo/aot_compile.py b/torch/_dynamo/aot_compile.py index 9a1dce544052..3ce7c3a72e2a 100644 --- a/torch/_dynamo/aot_compile.py +++ b/torch/_dynamo/aot_compile.py @@ -13,7 +13,7 @@ import torch import torch.fx from torch._dynamo.graph_utils import _graph_device_type -from torch._dynamo.precompile_context import SystemInfo +from torch._dynamo.package import SystemInfo from . import convert_frame from .hooks import Hooks diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py index bf5ba4be4973..1e23e4e4e4e4 100644 --- a/torch/_dynamo/guards.py +++ b/torch/_dynamo/guards.py @@ -136,6 +136,7 @@ DefaultsSource, DictGetItemSource, DictSubclassGetItemSource, + DynamicScalarSource, FlattenScriptObjectSource, FloatTensorSource, FSDPNNModuleSource, @@ -1719,6 +1720,14 @@ def get_guard_manager_from_source(self, source: Source) -> GuardManager: example_value=example_value, guard_manager_enum=guard_manager_enum, ) + elif istype(source, DynamicScalarSource): + assert base_guard_manager + out = base_guard_manager.lambda_manager( + python_lambda=lambda x: int(x), + source=source_name, + example_value=example_value, + guard_manager_enum=guard_manager_enum, + ) else: raise AssertionError( f"missing guard manager builder {source} - {source.name()}" diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py index 13b3de0280e2..8c0c7633fa6a 100644 --- a/torch/_dynamo/output_graph.py +++ b/torch/_dynamo/output_graph.py @@ -431,6 +431,7 @@ def __init__( f_code: CodeType, torch_function_mode_stack: list[torch.overrides.TorchFunctionMode], package: Optional["CompilePackage"], + one_graph: bool = False, ) -> None: super().__init__( local_scope, @@ -487,8 +488,10 @@ def __init__( # TrackedFake instances may have its metadata changed throughout # the program execution. tracked_fakes=self.tracked_fakes, - allow_scalar_outputs=config.capture_scalar_outputs, - allow_dynamic_output_shape_ops=config.capture_dynamic_output_shape_ops, + # We want to allow capture scalar outputs and allow_dynamic_output_shape_ops when fullgraph=True + allow_scalar_outputs=one_graph or config.capture_scalar_outputs, + allow_dynamic_output_shape_ops=one_graph + or config.capture_dynamic_output_shape_ops, prefer_deferred_runtime_asserts_over_guards=config.prefer_deferred_runtime_asserts_over_guards, co_fields=self.co_fields, ) @@ -2695,6 +2698,9 @@ def __init__( # tracer is the current tracer that's readily accessible in current tracer's graph. self.bound_symbols: dict[sympy.Symbol, Union[torch.fx.Proxy, LazyProxy]] = {} + # Maps _DynamicScalar object ids to allocated SymInt nodes, for symbol reuse + self.dynamic_scalar_nodes: dict[int, torch.SymInt] = {} + self.prev_inst = None # True if this tracer is currently tracing into torch.utils.checkpoint # as part of speculate_subgraph. 
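The one_graph plumbing above, together with the guard_or_false changes in the softmax/log_softmax decompositions and the updated torch.compile docstring, means fullgraph=True now opts into unbacked semantics by default. A minimal sketch of what that enables (not part of the patch; inputs and values are illustrative):

import torch

def f(x):
    nz = x.nonzero().float()  # data-dependent (unbacked) output shape
    return torch.softmax(nz, dim=0), x.sum().item()

# With fullgraph=True there is no longer a need to set
# torch._dynamo.config.capture_scalar_outputs or
# capture_dynamic_output_shape_ops by hand, and guard_or_false(x.numel() == 0)
# in the decompositions keeps unbacked sizes on the general max/exp/sum branch
# instead of guarding on numel.
soft, total = torch.compile(f, fullgraph=True)(
    torch.randint(0, 2, (32,), dtype=torch.int8)
)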
diff --git a/torch/_dynamo/package.py b/torch/_dynamo/package.py index e8113915008d..49eb86e9779b 100644 --- a/torch/_dynamo/package.py +++ b/torch/_dynamo/package.py @@ -19,6 +19,7 @@ import logging import os import pickle +import platform import shutil import sys import types @@ -27,16 +28,8 @@ from typing_extensions import Never import torch -import torch._inductor.package from torch._dynamo.exc import PackageError from torch._dynamo.graph_utils import _graph_device_type -from torch._dynamo.precompile_context import ( - PrecompileCacheArtifact, - PrecompileContext, - SystemInfo, -) -from torch._inductor.runtime.cache_dir_utils import cache_dir -from torch.compiler._cache import CacheArtifactFactory from .bytecode_transformation import get_code_keys from .utils import dynamo_timed, increment_frame @@ -304,12 +297,94 @@ def _find_code_source(obj: Any) -> Optional[str]: return toplevel.__qualname__, code_source.strip(".") +@dataclasses.dataclass(frozen=True) +class SystemInfo: + """ + System information including Python, PyTorch, and GPU details. + This information is used to ensure compiled artifacts can only be loaded + with compatible system configurations. + """ + + python_version: str + torch_version: str + toolkit_version: Optional[str] + triton_version: Optional[tuple[int, int]] + gpu_name: Optional[str] + CHECK_GPUS = ("cuda", "xpu") + + @classmethod + def current(cls) -> "SystemInfo": + """Create a SystemInfo instance with current system information.""" + # Get GPU name if CUDA or XPU is available + from torch.utils._triton import get_triton_version + + gpu_name, toolkit_version = None, None + for device_type in cls.CHECK_GPUS: + if getattr(torch, device_type).is_available(): + try: + gpu_name = getattr(torch, device_type).get_device_name() + toolkit_version = getattr(torch.version, device_type) + break + except Exception: + pass + + return cls( + python_version=platform.python_version(), + torch_version=torch.__version__, + toolkit_version=toolkit_version, + triton_version=get_triton_version((0, 0)), + gpu_name=gpu_name, + ) + + def check_compatibility( + self, other: "SystemInfo", device_type: str = "cpu" + ) -> None: + """ + Check if this SystemInfo is compatible with another SystemInfo. + Raises RuntimeError if incompatible. 
+ """ + if self.python_version != other.python_version: + raise RuntimeError( + f"Compile package was created with a different Python version: {self.python_version}" + ) + + if self.torch_version != other.torch_version: + raise RuntimeError( + f"Compile package was created with a different PyTorch version: {self.torch_version}" + ) + if device_type in self.CHECK_GPUS: + if not getattr(torch, device_type).is_available(): + raise RuntimeError(f"{device_type} is not available") + + if self.toolkit_version != other.toolkit_version: + raise RuntimeError( + f"Compile package was created with a different toolkit version: {self.toolkit_version}" + ) + + if ( + other.triton_version != (0, 0) + and self.triton_version != other.triton_version + ): + raise RuntimeError( + f"Compile package was created with a different Triton version: {self.triton_version}" + ) + + # Check GPU name if CUDA/XPU was used + if other.gpu_name is not None and self.gpu_name != other.gpu_name: + raise RuntimeError( + f"Compile package was created with different GPU: " + f"cached={self.gpu_name}, current={other.gpu_name}" + ) + + @dataclasses.dataclass class _DynamoCacheEntry: codes: list[_DynamoCodeCacheEntry] source_info: SourceInfo device_type: str system_info: SystemInfo = dataclasses.field(default_factory=SystemInfo.current) + fn_name: Optional[str] = None + fn_first_lineno: Optional[str] = None @property def backend_ids(self) -> set[_BackendId]: @@ -320,15 +395,15 @@ def check_versions(self) -> None: current_system_info = SystemInfo.current() self.system_info.check_compatibility(current_system_info, self.device_type) - -@CacheArtifactFactory.register -class _DynamoCacheArtifact(PrecompileCacheArtifact[_DynamoCacheEntry]): - @staticmethod - def type() -> str: - return "precompile_dynamo" - - def after_deserialization(self) -> _DynamoCacheEntry: - return pickle.loads(self.content) + def debug_info(self) -> dict[str, Any]: + assert len(self.codes) > 0 + return { + "num_codes": str(len(self.codes)), + "fn_name": self.fn_name, + "fn_first_lineno": self.fn_first_lineno, + "device_type": self.device_type, + "backend_ids": list(self.backend_ids), + } def _hash_source(source: str) -> str: @@ -691,10 +766,13 @@ def install(self, backends: dict[_BackendId, Any]) -> None: def cache_entry(self) -> _DynamoCacheEntry: self.validate() + assert self._innermost_fn is not None return _DynamoCacheEntry( codes=list(self._codes.values()), source_info=self._source_info, device_type=self._device_type, + fn_name=self._innermost_fn.__qualname__, + fn_first_lineno=self._innermost_fn.__code__.co_firstlineno, ) @staticmethod @@ -709,17 +787,7 @@ def source_id_from_fn(fn: Callable[..., Any]) -> str: return sha256_hash.hexdigest() -@CacheArtifactFactory.register -class EagerCacheArtifact(PrecompileCacheArtifact[Any]): - @staticmethod - def type() -> str: - return "precompile_eager" - - def after_deserialization(self) -> Any: - return pickle.loads(self.content) - - -_Backends = dict[_BackendId, PrecompileCacheArtifact[Any]] +_Backends = dict[_BackendId, Any] class DynamoStore(abc.ABC): @@ -733,16 +801,22 @@ def record_package(self, package: CompilePackage) -> None: """ Records a package to PrecompileContext, so that it can be serialized later. 
""" + from torch._dynamo.precompile_context import PrecompileContext + cache_entry = package.cache_entry() - pickled_result = pickle.dumps(cache_entry) - PrecompileContext.record_artifact( - _DynamoCacheArtifact.type(), key=package.source_id, content=pickled_result + PrecompileContext.record_dynamo_cache_entry( + cache_entry=cache_entry, key=package.source_id ) def record_eager_backend(self, backend_id: _BackendId, backend: Any) -> None: """ Records eager fx graphs to PrecompileContext for testing purposes. """ + from torch._dynamo.precompile_context import ( + EagerCacheArtifact, + PrecompileContext, + ) + pickled_result = pickle.dumps(backend) PrecompileContext.record_artifact( EagerCacheArtifact.type(), key=backend_id, content=pickled_result @@ -772,6 +846,11 @@ def save_cache_entry(self, cache_entry: _DynamoCacheEntry, key: str) -> None: """ Saves a package to a given path. Grabs backends from PrecompileContext. """ + from torch._dynamo.precompile_context import ( + PrecompileCacheArtifact, + PrecompileContext, + ) + backend_content: _Backends = {} for backend_id in cache_entry.backend_ids: serialized_backend = PrecompileContext.serialize_artifact_by_key(backend_id) @@ -808,6 +887,8 @@ def read(self, path: str) -> tuple[_DynamoCacheEntry, _Backends]: def load_cache_entry( self, key: str ) -> tuple[_DynamoCacheEntry, dict[_BackendId, Any]]: + from torch._dynamo.precompile_context import PrecompileContext + cache_entry, backend_content = self.read(key) for backend_id, backend in backend_content.items(): PrecompileContext.record_artifact( @@ -964,4 +1045,10 @@ def load_and_install_package( return package +def cache_dir() -> str: + from torch._inductor.runtime.cache_dir_utils import cache_dir + + return cache_dir() + + DynamoCache = DiskDynamoCache(os.path.join(cache_dir(), "dynamo")) diff --git a/torch/_dynamo/precompile_context.py b/torch/_dynamo/precompile_context.py index a032ad996810..a9ec513a3a41 100644 --- a/torch/_dynamo/precompile_context.py +++ b/torch/_dynamo/precompile_context.py @@ -1,8 +1,7 @@ import copy -import dataclasses +import json import logging import pickle -import platform from abc import abstractmethod from collections import defaultdict from itertools import chain @@ -10,6 +9,7 @@ from typing_extensions import override import torch +from torch._dynamo.package import _DynamoCacheEntry from torch.compiler._cache import ( _serialize_single_cache, CacheArtifact, @@ -20,7 +20,6 @@ ) from torch.utils._appending_byte_serializer import AppendingByteSerializer from torch.utils._ordered_set import OrderedSet -from torch.utils._triton import get_triton_version """ @@ -69,6 +68,16 @@ def after_deserialization(self) -> T: ... 
+@CacheArtifactFactory.register +class EagerCacheArtifact(PrecompileCacheArtifact[Any]): + @staticmethod + def type() -> str: + return "precompile_eager" + + def after_deserialization(self) -> Any: + return pickle.loads(self.content) + + class EditablePrecompileCacheArtifact(Generic[T]): """ A PrecompileCacheArtifact whose content isn't encoded until we call PrecompileContext.serialize() @@ -99,6 +108,21 @@ def edit_contents(self, edit_fn: Callable[..., Any]) -> None: self.content = edit_fn(self.content) +@CacheArtifactFactory.register +class _DynamoCacheArtifact(PrecompileCacheArtifact[_DynamoCacheEntry]): + @staticmethod + def type() -> str: + return "precompile_dynamo" + + def after_deserialization(self) -> _DynamoCacheEntry: + result = pickle.loads(self.content) + return result + + +class BypassDynamoCacheEntry(Exception): + pass + + class PrecompileContext(CacheArtifactManager): """ PrecompileContext is a special CacheArtifactManager for handling precompilation @@ -106,20 +130,29 @@ class PrecompileContext(CacheArtifactManager): of placing each artifact into respective caches, it will stitch all the cache artifacts for a single key together and place it into a global Precompile Cache. + PrecompileContext has two main portions: dynamo_cache_entries and backend_cache_artifacts. + When saving, PrecompileContext.serialize() will serialize all dynamo cache entries along with any PrecompileCacheArtifacts that + are needed to save those dynamo cache entries. + The following artifact types are supported by PrecompileContext: - BundledAOTAutogradCacheArtifact - - DynamoCodeStateArtifact - AutotuneCacheArtifact (regular autotune results, same as Megacache) + """ # Protected by the compile_lock - # _new_cache_artifacts_by_key organizes results by the key of each artifact. + # _backend_artifacts_by_key organizes results by the key of each artifact. # This allows us to implement serialize_by_key easily. - # On call to `serialize()`, all cache artifacts in _new_cache_artifacts_by_key + # On call to `serialize()`, all cache artifacts in _backend_artifacts_by_key # are transferred to _new_cache_artifacts before serialization. - _new_cache_artifacts_by_key: dict[ + _backend_artifacts_by_key: dict[ str, Union[EditablePrecompileCacheArtifact[object], CacheArtifact] ] = {} + + # On call to `serialize()`, all cache artifacts in _dynamo_cache_entries are converted + # into DynamoCacheArtifacts and added to _new_cache_artifacts for serialization + _dynamo_cache_entries: dict[str, _DynamoCacheEntry] = {} + _new_cache_artifacts: CacheArtifactsResult = defaultdict(list) # Keep a separate seen artifacts list to make avoid unnecessary duplicates # This list will not be cleared between serialize() calls @@ -134,7 +167,8 @@ class PrecompileContext(CacheArtifactManager): @classmethod def clear(cls) -> None: - cls._new_cache_artifacts_by_key.clear() + cls._backend_artifacts_by_key.clear() + cls._dynamo_cache_entries.clear() super().clear() @override @@ -164,29 +198,51 @@ def record_artifact( return cls._seen_artifacts.add(artifact) - cls._new_cache_artifacts_by_key[key] = artifact + cls._backend_artifacts_by_key[key] = artifact + + @classmethod + def record_dynamo_cache_entry( + cls, cache_entry: _DynamoCacheEntry, key: str + ) -> None: + cls._dynamo_cache_entries[key] = cache_entry @classmethod def _save_artifacts_by_type(cls) -> None: """ We normally record artifacts by key, but serialization expects them to be organized - by artifact type. 
This function transfers artifacts from _new_cache_artifacts_by_key to _new_cache_artifacts + by artifact type. This function transfers artifacts from _backend_artifacts_by_key to _new_cache_artifacts """ - for artifact in cls._new_cache_artifacts_by_key.values(): + for key, cache_entry in cls._dynamo_cache_entries.items(): + backends = cache_entry.backend_ids + try: + for id_ in backends: + if id_ not in cls._backend_artifacts_by_key: + logger.warning( + "Bypassing %s because backend %s not found in artifacts" + ) + raise BypassDynamoCacheEntry + except BypassDynamoCacheEntry: + continue + pickled_result = pickle.dumps(cache_entry) + dynamo_artifact = _DynamoCacheArtifact(key, pickled_result) + cls._new_cache_artifacts[_DynamoCacheArtifact.type()].append( + dynamo_artifact + ) + + # Save all the backend artifacts + for artifact in cls._backend_artifacts_by_key.values(): if isinstance(artifact, EditablePrecompileCacheArtifact): artifact = artifact.real_encode() cls._new_cache_artifacts[artifact.__class__.type()].append(artifact) - cls._new_cache_artifacts_by_key.clear() + cls._backend_artifacts_by_key.clear() @classmethod def edit_artifact(cls, key: str, edit_fn: Callable[..., Any]) -> None: """ Edit the content of an existing artifact """ - assert key in cls._new_cache_artifacts_by_key, ( - f"Key {key} not found in artifacts" - ) - artifact = cls._new_cache_artifacts_by_key[key] + assert key in cls._backend_artifacts_by_key, f"Key {key} not found in artifacts" + artifact = cls._backend_artifacts_by_key[key] assert isinstance(artifact, EditablePrecompileCacheArtifact), ( "Artifact is not editable" ) @@ -195,133 +251,157 @@ def edit_artifact(cls, key: str, edit_fn: Callable[..., Any]) -> None: @classmethod def serialize_artifact_by_key(cls, key: str) -> Optional[CacheArtifact]: """ - Serialize all artifacts with the given key returned in a list. + Serialize all backend artifacts with the given key returned in a list. 
""" - result = cls._new_cache_artifacts_by_key.get(key, None) + result = cls._backend_artifacts_by_key.get(key, None) if isinstance(result, EditablePrecompileCacheArtifact): result = result.real_encode() return result @classmethod def serialize(cls) -> Optional[tuple[bytes, CacheInfo]]: - cls._save_artifacts_by_type() - # No need to serialize if there are no new dynamo compiles - if "precompile_dynamo" not in cls._new_cache_artifacts: + if not cls._dynamo_cache_entries: return None - return super().serialize() + + debug_info = cls.dump_debug_info( + cls._dynamo_cache_entries, cls._backend_artifacts_by_key + ) + artifacts = json.dumps({"artifacts": debug_info}) + torch._logging.trace_structured( + "artifact", + metadata_fn=lambda: { + "name": "dynamo_cache_save_contents", + "encoding": "json", + }, + payload_fn=lambda: artifacts, + expect_trace_id=False, + ) + cls._save_artifacts_by_type() + + result = super().serialize() + assert result is not None + data, info = result + + return data, info + + @staticmethod + def dump_debug_info( + dynamo_entries: dict[str, _DynamoCacheEntry], + backend_artifacts: dict[ + str, Union[EditablePrecompileCacheArtifact[object], CacheArtifact] + ], + ) -> dict[str, Any]: + """ + Return a JSON serializable debug dump of all entries in the precompile context + Called in serialize before serialization, and in populate_caches after deserialization + """ + # Print debug information + debug_info: defaultdict[str, list[Any]] = defaultdict(list) + for key, cache_entry in dynamo_entries.items(): + info = cache_entry.debug_info() + info["key"] = key + debug_info["precompile_dynamo"].append(info) + + for artifact in backend_artifacts.values(): + if isinstance(artifact, EditablePrecompileCacheArtifact): + debug_info[artifact.artifact_type].append(artifact.key) + else: + debug_info[artifact.__class__.type()].append(artifact.key) + + return debug_info @staticmethod def populate_caches(artifacts: CacheArtifactsResult) -> CacheInfo: PrecompileContext._ensure_cache_artifacts_registered() - artifacts_by_key = {} + backend_artifacts: dict[str, Any] = {} + dynamo_entries: dict[str, _DynamoCacheEntry] = {} cache_info = CacheInfo() for artifact in chain(*artifacts.values()): if artifact.type() == "autotune": # Populate autotune cache artifacts artifact.populate_cache() + elif artifact.type() == "precompile_dynamo": + assert isinstance(artifact, _DynamoCacheArtifact) + cache_entry: _DynamoCacheEntry = artifact.after_deserialization() + dynamo_entries[artifact.key] = cache_entry else: - artifacts_by_key[artifact.key] = artifact + backend_artifacts[artifact.key] = artifact cache_info.add(artifact) + num_artifacts = len(artifacts["precompile_dynamo"]) + + debug_info = PrecompileContext.dump_debug_info( + dynamo_entries, backend_artifacts + ) + debug_str = json.dumps( + { + "num_entries": num_artifacts, + "artifacts": debug_info, + }, + ) + torch._logging.trace_structured( + "artifact", + metadata_fn=lambda: { + "name": "dynamo_cache_entries", + "encoding": "json", + }, + payload_fn=lambda: debug_str, + expect_trace_id=False, + ) from torch._dynamo.package import _BackendId, DynamoCache - for dynamo_entry in artifacts["precompile_dynamo"]: - assert isinstance(dynamo_entry, PrecompileCacheArtifact) - cache_entry = dynamo_entry.after_deserialization() - # Grab backends from the dynamo cache entry - backends = cache_entry.backend_ids - backend_content: dict[_BackendId, PrecompileCacheArtifact[Any]] = {} - for id_ in backends: - assert id_ in artifacts_by_key, f"Backend {id_} not found in 
artifacts" - artifact = artifacts_by_key[id_] - assert isinstance(artifact, PrecompileCacheArtifact) - backend_content[id_] = artifact - DynamoCache.write(cache_entry, backend_content, dynamo_entry.key) + for key, cache_entry in dynamo_entries.items(): + try: + backends = cache_entry.backend_ids + backend_content: dict[_BackendId, PrecompileCacheArtifact[Any]] = {} + for id_ in backends: + if id_ not in backend_artifacts: + debug_str = json.dumps( + { + "entry": cache_entry.debug_info, + "key": key, + } + ) + logger.warning("Backend not found") + torch._logging.trace_structured( + "artifact", + metadata_fn=lambda: { + "name": "dynamo_cache_bypass", + "encoding": "json", + }, + payload_fn=lambda: debug_str, + expect_trace_id=False, + ) + continue + artifact = backend_artifacts[id_] + assert isinstance(artifact, PrecompileCacheArtifact) + backend_content[id_] = artifact + DynamoCache.write(cache_entry, backend_content, key) + except Exception as e: + logger.warning("Failed to deserialize cache entry %s: %s", key, str(e)) + + error = e + data = json.dumps( + { + "key": key, + "error": str(error), + } + ) + torch._logging.trace_structured( + "artifact", + metadata_fn=lambda: { + "name": "dynamo_cache_exception", + "encoding": "json", + }, + payload_fn=lambda: data, + ) + continue return cache_info @classmethod def _ensure_cache_artifacts_registered(cls) -> None: - from torch._dynamo.package import _DynamoCacheArtifact # noqa: F401 from torch._functorch._aot_autograd.autograd_cache import ( # noqa: F401 BundledAOTAutogradCacheArtifact, ) - - -@dataclasses.dataclass(frozen=True) -class SystemInfo: - """ - System information including Python, PyTorch, and GPU details. - This information is used to ensure compiled artifacts can only be loaded - with compatible system configurations. - """ - - python_version: str - torch_version: str - toolkit_version: Optional[str] - triton_version: Optional[tuple[int, int]] - gpu_name: Optional[str] - CHECK_GPUS = ("cuda", "xpu") - - @classmethod - def current(cls) -> "SystemInfo": - """Create a SystemInfo instance with current system information.""" - # Get GPU name if CUDA or XPU is available - gpu_name, toolkit_version = None, None - for device_type in cls.CHECK_GPUS: - if getattr(torch, device_type).is_available(): - try: - gpu_name = getattr(torch, device_type).get_device_name() - toolkit_version = getattr(torch.version, device_type) - break - except Exception: - pass - - return cls( - python_version=platform.python_version(), - torch_version=torch.__version__, - toolkit_version=toolkit_version, - triton_version=get_triton_version((0, 0)), - gpu_name=gpu_name, - ) - - def check_compatibility( - self, other: "SystemInfo", device_type: str = "cpu" - ) -> None: - """ - Check if this SystemInfo is compatible with another SystemInfo. - Raises RuntimeError if incompatible. 
- """ - if self.python_version != other.python_version: - raise RuntimeError( - f"Compile package was created with a different Python version: {self.python_version}" - ) - - if self.torch_version != other.torch_version: - raise RuntimeError( - f"Compile package was created with a different PyTorch version: {self.torch_version}" - ) - if device_type in self.CHECK_GPUS: - if not getattr(torch, device_type).is_available(): - raise RuntimeError(f"{device_type} is not available") - - if self.toolkit_version != other.toolkit_version: - raise RuntimeError( - f"Compile package was created with a different toolkit version: {self.toolkit_version}" - ) - - if ( - other.triton_version != (0, 0) - and self.triton_version != other.triton_version - ): - raise RuntimeError( - f"Compile package was created with a different Triton version: {self.triton_version}" - ) - - # Check GPU name if CUDA/XPU was used - if other.gpu_name is not None and self.gpu_name != other.gpu_name: - raise RuntimeError( - f"Compile package was created with different GPU: " - f"cached={self.gpu_name}, current={other.gpu_name}" - ) diff --git a/torch/_dynamo/source.py b/torch/_dynamo/source.py index b17ccfe09dae..559972464f82 100644 --- a/torch/_dynamo/source.py +++ b/torch/_dynamo/source.py @@ -526,6 +526,29 @@ def name(self) -> str: return f"cast_symbool_to_symint_guardless({self.base.name()})" +@dataclasses.dataclass(frozen=True) +class DynamicScalarSource(ChainedSource): + is_int: bool + + def __post_init__(self) -> None: + assert self.base is not None + + def reconstruct(self, codegen: "PyCodegen") -> None: + # Integer casting at reconstruction helps reduce the amount of DynamicInts returned + # to the user, in favor of plain ints. + # For example, a compiled region that only does int arithmetic could return a + # DynamicInt without the casting here. + codegen.add_push_null(lambda: codegen.load_import_from("builtins", "int")) + codegen(self.base) + codegen.extend_output(create_call_function(1, False)) + + def guard_source(self) -> GuardSource: + return self.base.guard_source() + + def name(self) -> str: + return f"int({self.base.name()})" + + @dataclasses.dataclass(frozen=True) class FlattenScriptObjectSource(ChainedSource): def __post_init__(self) -> None: diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py index efb27fc2903d..b762bd95fa11 100644 --- a/torch/_dynamo/symbolic_convert.py +++ b/torch/_dynamo/symbolic_convert.py @@ -3847,6 +3847,7 @@ def __init__( global_scope=f_globals, f_code=f_code, torch_function_mode_stack=torch_function_mode_stack, + one_graph=one_graph, package=package, ), instructions=instructions, diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py index 660042b33b87..547e826f3431 100644 --- a/torch/_dynamo/variables/builder.py +++ b/torch/_dynamo/variables/builder.py @@ -60,6 +60,7 @@ from torch._utils_internal import justknobs_check from torch.fx.experimental._backward_state import BackwardState from torch.fx.experimental._dynamism import normalize_source_name +from torch.fx.experimental.sym_node import _DynamicScalar, DynamicInt from torch.fx.experimental.symbolic_shapes import ( _constrain_range_for_size, _nested_int_aware_sort, @@ -101,6 +102,7 @@ ConvertIntSource, DictGetItemSource, DictSubclassGetItemSource, + DynamicScalarSource, FloatTensorSource, GetItemSource, GradSource, @@ -456,7 +458,9 @@ def _is_deduplicable_sym_variable(value, vt): # should NOT track them. 
If we use a single SymNodeVariable instance to track them # across multiple uses, then guards created for one usage will incorrectly apply to # all other usages of that constant, leading to unnecessary recompilations. - return is_torch_sym(value) and isinstance(vt, SymNodeVariable) + return ( + is_torch_sym(value) or isinstance(value, _DynamicScalar) + ) and isinstance(vt, SymNodeVariable) if ( ( @@ -1103,6 +1107,46 @@ def build_key_value(i, k, v): ): self.install_guards(GuardBuilder.FUNCTION_MATCH) return ItertoolsVariable(value, source=self.source) + elif isinstance(value, _DynamicScalar): + is_int = isinstance(value, DynamicInt) + source = DynamicScalarSource(self.source, is_int) + if id(value) in self.tx.output.root_tracer.dynamic_scalar_nodes: + # If we've already seen this dynamic scalar, reuse the existing + # SymInt/SymFloat node. + node = self.tx.output.root_tracer.dynamic_scalar_nodes[id(value)] + else: + sym = self.tx.output.shape_env.create_unspecified_symbol( + value.real, + source=source, + dynamic_dim=DimDynamic.DYNAMIC, + ) + node = self.tx.output.shape_env.create_symintnode( + sym, + hint=value.real, + source=source, + ) + + # Bind to graph input + sym_node_proxy = self.tx.output.root_tracer.create_graph_input( + re.sub(r"[^a-zA-Z0-9]+", "_", self.name), + type(node), + node, + source=source, + ) + sym_node_proxy.node.meta["grapharg"] = GraphArg( + source, + node, + False, + None, + is_tensor=False, + example_strong_ref=node, + ) + sym_expr = node.node.expr + assert isinstance(sym_expr, sympy.Symbol), ( + f"{sym_expr} is not a basic Symbol." + ) + self.tx.output.tracked_fakes.append(TrackedFake(node, source, None)) + return SymNodeVariable(sym_node_proxy, node) elif is_torch_sym(value): # Note: this doesn't handle nested symints. # For SymBool input, we reuse the infra for SymInt by simulating SymBool with a SymInt in dynamo. diff --git a/torch/_dynamo/variables/lists.py b/torch/_dynamo/variables/lists.py index 9bef1aecc342..63d70235d828 100644 --- a/torch/_dynamo/variables/lists.py +++ b/torch/_dynamo/variables/lists.py @@ -1230,6 +1230,26 @@ def reconstruct(self, codegen: "PyCodegen") -> None: codegen.extend_output(create_rot_n(2)) codegen.store_attr(name) + def _is_method_overridden(self, method_name: str) -> bool: + """Checks if a method is overridden in the NamedTuple subclass. + + Args: + method_name (str): The name of the method to check. + + Returns: + bool: True if the method is overridden in the subclass, False otherwise. + + Raises: + ValueError: If the NamedTuple class does not inherit from both Tuple and Object. 
+ """ + if len(self.tuple_cls.__mro__) < 3: + raise ValueError("NamedTuple should inherit from Tuple and Object.") + if getattr(self.tuple_cls, method_name, None) == getattr( + self.tuple_cls.__mro__[-3], method_name, None + ): + return False + return True + def call_method( self, tx, @@ -1257,6 +1277,44 @@ def call_method( tx.output.side_effects.store_attr(self, attr, value) self.dynamic_attributes[attr] = value return ConstantVariable.create(None) + elif name == "_replace": + # NamedTuple._replace should create a new instance with replaced fields + if args: + raise_observed_exception( + TypeError, + tx, + args=[ + ConstantVariable.create( + "_replace() takes no positional arguments" + ) + ], + ) + + # Get the field names for validation + fields = self.fields() + + # Start with current items (copy them) + new_items = list(self.items) + + # Replace fields specified in kwargs + for field_name, new_value in kwargs.items(): + if field_name not in fields: + raise_observed_exception( + ValueError, + tx, + args=[ + ConstantVariable.create( + f"Got unexpected field name: '{field_name}'" + ) + ], + ) + + # Replace the item at the field's index + field_index = fields.index(field_name) + new_items[field_index] = new_value + + return NamedTupleVariable(new_items, self.tuple_cls) + return super().call_method(tx, name, args, kwargs) def var_getattr(self, tx: "InstructionTranslator", name): @@ -1275,6 +1333,23 @@ def check_and_create_method(): else: return None + # Avoid UserMethodVariable fallback precisely when methods NamedTuple methods have not been overwritten. + if ( + name == "_replace" + and not self._is_method_overridden("_replace") + and not self._is_method_overridden("__getattr__") + ): + # Return a BuiltinVariable for the _replace method + # Get the actual _replace method from the tuple class + actual_replace_method = getattr(self.tuple_cls, "_replace", None) + if actual_replace_method: + from ..source import AttrSource + + source = AttrSource(self.source, name) if self.source else None + return variables.GetAttrVariable(self, name, source=source) + # Fallback if _replace doesn't exist (shouldn't happen for proper NamedTuples) + return super().var_getattr(tx, name) + if name == "_fields": source = NamedTupleFieldsSource(self.source) if self.source else None return VariableTracker.build(tx, self.fields(), source=source) diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py index 08dab47451ab..5dea5cd35d17 100644 --- a/torch/_dynamo/variables/tensor.py +++ b/torch/_dynamo/variables/tensor.py @@ -999,7 +999,11 @@ def method_data_ptr(self, *args, **kwargs): return DataPtrVariable(self) def method_item(self, *args, **kwargs): - if not config.capture_scalar_outputs: + from ..symbolic_convert import InstructionTranslator + + tx = InstructionTranslator.current_tx() + # We enable capture_scalar_outputs when full_graph=True by default. 
+ if not tx.one_graph and not config.capture_scalar_outputs: self._warn_capture_scalar_outputs() unimplemented_v2( gb_type="Unsupported Tensor.item() call with capture_scalar_outputs=False", diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py index cda24724575e..dce60af39b36 100644 --- a/torch/_inductor/codecache.py +++ b/torch/_inductor/codecache.py @@ -1252,7 +1252,10 @@ def cache_hit_post_compile( ) trace_structured( "inductor_output_code", - lambda: {"filename": artifact_path}, + lambda: { + "filename": artifact_path, + "file_path": os.path.abspath(artifact_path), + }, payload_fn=lambda: code, ) trace_structured( diff --git a/torch/_inductor/codegen/cuda/cutlass_lib_extensions/evt_extensions.py b/torch/_inductor/codegen/cuda/cutlass_lib_extensions/evt_extensions.py index 605b93dff592..a2beb9ecfc42 100644 --- a/torch/_inductor/codegen/cuda/cutlass_lib_extensions/evt_extensions.py +++ b/torch/_inductor/codegen/cuda/cutlass_lib_extensions/evt_extensions.py @@ -38,7 +38,7 @@ if config.is_fbcode(): import python_cutlass # type: ignore[import-untyped, import-not-found] # noqa: F401 else: - import cutlass as python_cutlass # type: ignore[import-untyped, import-not-found] # noqa: F401 + import cutlass_cppgen as python_cutlass # type: ignore[import-untyped, import-not-found] # noqa: F401 from torch._inductor.codegen.cuda import cuda_env from torch._inductor.utils import IndentedBuffer @@ -174,7 +174,7 @@ def _render_argument_type( def is_nested_visitor_type(t: type) -> bool: return ".".join([t.__module__, t.__qualname__]) in { "python_cutlass.backend.c_types.visitor_factory..VisitorType", - "cutlass.backend.c_types.visitor_factory..VisitorType", + "cutlass_cppgen.backend.c_types.visitor_factory..VisitorType", } buffer = IndentedBuffer() @@ -235,7 +235,7 @@ def _get_arg_from_node( # Once again, need to check for local class type for stride tuple if str(arg_ty) in { ".TupleType'>", - ".TupleType'>", + ".TupleType'>", }: DEFAULT_STRIDE_LEN = 3 assert len(node.get_layout().stride) <= DEFAULT_STRIDE_LEN diff --git a/torch/_inductor/codegen/cuda/cutlass_utils.py b/torch/_inductor/codegen/cuda/cutlass_utils.py index 7ca33ea779cc..bdbbbe58a3b1 100644 --- a/torch/_inductor/codegen/cuda/cutlass_utils.py +++ b/torch/_inductor/codegen/cuda/cutlass_utils.py @@ -43,7 +43,7 @@ def move_cutlass_compiled_cache() -> None: if config.is_fbcode(): import python_cutlass # type: ignore[import-not-found] else: - import cutlass as python_cutlass # type: ignore[import-not-found] # noqa: F401 + import cutlass_cppgen as python_cutlass # type: ignore[import-not-found] # noqa: F401 # Check if the CACHE_FILE attribute exists in python_cutlass and if the file exists if not hasattr(python_cutlass, "CACHE_FILE") or not os.path.exists( @@ -118,7 +118,7 @@ def path_join(path0, path1): tmp_cutlass_full_path = os.path.abspath(os.path.join(cache_dir(), "torch_cutlass")) dst_link_library = path_join(tmp_cutlass_full_path, "cutlass_library") - dst_link_cutlass = path_join(tmp_cutlass_full_path, "cutlass") + dst_link_cutlass = path_join(tmp_cutlass_full_path, "cutlass_cppgen") dst_link_pycute = path_join(tmp_cutlass_full_path, "pycute") # mock modules to import cutlass @@ -156,7 +156,7 @@ def link_and_append(dst_link, src_path, parent_dir): ) try: - import cutlass # noqa: F401, F811 + import cutlass_cppgen # noqa: F401, F811 import cutlass_library.generator # noqa: F401 import cutlass_library.library # noqa: F401 import cutlass_library.manifest # noqa: F401 diff --git a/torch/_inductor/codegen/triton.py 
b/torch/_inductor/codegen/triton.py index bcc7033a2848..3e923be85319 100644 --- a/torch/_inductor/codegen/triton.py +++ b/torch/_inductor/codegen/triton.py @@ -2743,6 +2743,8 @@ def decide_later(): dtype = torch.bool load_buffer = self.get_load_buffer(indexing) + if config.triton.enable_pdl: + load_buffer.writeline("tl.extra.cuda.gdc_wait()") result_var = self.cse.generate( load_buffer, make_line(line), dtype=dtype, shape=shape ) @@ -4395,6 +4397,9 @@ def add_constexpr_arg(arg_name): triton_meta["configs"] = [config_of(signature)] + if config.triton.enable_pdl: + triton_meta["launch_pdl"] = True + # Triton compiler includes equal_to_1 args into constants even # when they are not constexpr. otherwise there may be a segfault # during launching the Inductor-compiled Triton kernel. diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py index bea2ab0bbce9..888aa8afa483 100644 --- a/torch/_inductor/config.py +++ b/torch/_inductor/config.py @@ -1460,6 +1460,9 @@ class triton: os.environ.get("TORCHINDUCTOR_DECOMPOSE_K_THRESHOLD", "32") ) + # Programmatic Dependent Launch improves launch latency on Nvidia Hopper+ devices + enable_pdl = False + class aot_inductor: """ diff --git a/torch/_inductor/fx_passes/quantization.py b/torch/_inductor/fx_passes/quantization.py index 62f04504b606..8820323b3def 100644 --- a/torch/_inductor/fx_passes/quantization.py +++ b/torch/_inductor/fx_passes/quantization.py @@ -1112,9 +1112,7 @@ def fn(match): and w2.dtype == torch.int8 and w3.dtype == torch.int8 and scales.dtype == torch.bfloat16 - # _weight_int8pack_mm kernel only supports cpu now - # TODO: add cuda kernel support instead of calling mul+sum - and x.device.type == "cpu" + and x.device.type in ("cpu", "cuda") and x.device == w1.device and w1.device == w2.device and w2.device == w3.device diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py index d10dc7a46426..a921f1af1f79 100644 --- a/torch/_inductor/graph.py +++ b/torch/_inductor/graph.py @@ -2390,7 +2390,10 @@ def _compile_to_module_lines( else: trace_structured( "inductor_output_code", - lambda: {"filename": path}, + lambda: { + "filename": path, + "file_path": os.path.abspath(path), + }, payload_fn=lambda: wrapper_code.value, ) with dynamo_timed("PyCodeCache.load_by_key_path", log_pt2_compile_event=True): diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py index cd6139cfd513..77500888d223 100644 --- a/torch/_inductor/ir.py +++ b/torch/_inductor/ir.py @@ -8199,12 +8199,23 @@ def realize_hint(self) -> None: self.realize() def has_accumulated_enough_reads_by_size(self, threshold: int) -> bool: - size_of_reads = [V.graph.get_dep_size_hint(dep) for dep in self.get_reads()] + from torch._inductor.utils import is_nonfreeable_buffers + + size_of_reads = [ + V.graph.get_dep_size_hint(dep) + for dep in self.get_reads() + if not is_nonfreeable_buffers(dep) + ] if not size_of_reads: return False total_size = sum(size_of_reads) max_size = max(size_of_reads) - return total_size > threshold and total_size / max_size >= 2 + min_size = min(size_of_reads) + return ( + total_size >= threshold + and total_size / max_size >= 2 + and max_size == min_size + ) def has_exceeded_max_reads(self) -> bool: return isinstance(self.data, Pointwise) and ( diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py index 6c5e8ad1ca8b..d3d24bfef777 100644 --- a/torch/_inductor/lowering.py +++ b/torch/_inductor/lowering.py @@ -7220,9 +7220,8 @@ def prepare_softmax_online(x, dim): reduction_numel=rnumel, ) - if ( - num_split == 1 - and 
V.graph.sizevars.size_hint(rnumel) >= config.unroll_reductions_threshold + if num_split == 1 and V.graph.sizevars.statically_known_geq( + rnumel, config.unroll_reductions_threshold ): max_tensor, sum_tensor = OnlineSoftmaxReduction.create( input_node=x, num_output=2, reduction_hint=hint, **kwargs diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py index 6d978af8d772..d99f51d250e3 100644 --- a/torch/_inductor/runtime/triton_heuristics.py +++ b/torch/_inductor/runtime/triton_heuristics.py @@ -764,6 +764,15 @@ def _precompile_config(self, cfg: Config) -> CompileResult[_KernelType]: ), } ) + if self.device_props.type == "cuda": + options.update( + { + "launch_cooperative_grid": compile_meta.get( + "launch_cooperative_grid", False + ), + "launch_pdl": compile_meta.get("launch_pdl", False), # True + } + ) if self.device_props.type == "hip": if "waves_per_eu" in compile_meta: options["waves_per_eu"] = compile_meta["waves_per_eu"] @@ -1514,6 +1523,13 @@ def check_can_launch() -> StaticallyLaunchedCudaKernel: # Requires storing the entire binary raise CannotStaticallyLaunchKernel("store_cubin is enabled") + if getattr(kernel.metadata, "launch_pdl", False) or getattr( + kernel.metadata, "launch_cooperative_grid", False + ): + raise CannotStaticallyLaunchKernel( + "static launch does not support launch attributes" + ) + cubin_location = os.path.join( triton_cache_dir(triton_meta.get("device", 0)), triton_hash_to_path_key(kernel.hash), diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py index 86494ba967d0..9fcaf698a21c 100644 --- a/torch/_inductor/utils.py +++ b/torch/_inductor/utils.py @@ -3749,4 +3749,6 @@ def is_nonfreeable_buffers(dep: Dep) -> bool: # before checking for known strings. if V.graph.name: dep_name = dep_name.removeprefix(V.graph.name + "_") - return dep_name.startswith(("primals_", "arg", "fwd_rng_state", "bwd_rng_state")) + return dep_name.startswith( + ("primals_", "arg", "fwd_rng_state", "bwd_rng_state", "tangents") + ) diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py index 18455b519415..59ab6302624d 100644 --- a/torch/_refs/__init__.py +++ b/torch/_refs/__init__.py @@ -1997,9 +1997,13 @@ def clamp_max( # https://pytorch.org/docs/stable/generated/torch.where.html -# TODO: implement alternate where -@register_decomposition(aten.where) -@out_wrapper() +# TODO: implement where.default +@register_decomposition(aten.where.self) +@register_decomposition(aten.where.ScalarSelf) +@register_decomposition(aten.where.ScalarOther) +@register_decomposition(aten.where.Scalar) +@register_decomposition(aten.where.self_out) +@out_wrapper(exact_dtype=True) @elementwise_type_promotion_wrapper( type_promoting_args=("a", "b"), type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.NO_OPMATH, @@ -2259,11 +2263,14 @@ def _reduction( dims = (dims,) # type: ignore[assignment] dims = utils.reduction_dims(a.shape, dims) if not has_identity: - valid_shape = a.ndim == 0 or builtins.all(a.shape[i] for i in dims) - if not valid_shape: - raise RuntimeError( - "reducing over zero-size dimension for reduction operation without identity" - ) + from torch.fx.experimental.symbolic_shapes import sym_and + + valid_shape = a.ndim == 0 or sym_and(*(a.shape[i] > 0 for i in dims)) + torch._check( + valid_shape, + lambda: "reducing over zero-size dimension for reduction operation without identity", + ) + computation_dtype, result_dtype = utils.reduction_dtypes( a, output_dtype_kind, dtype ) diff --git a/torch/csrc/distributed/c10d/init.cpp 
b/torch/csrc/distributed/c10d/init.cpp index 128fab6593b3..99dd56e07ffd 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -3515,11 +3515,35 @@ Example:: py::arg("rank"), py::arg("size"), py::arg("options"), - R"(Create a new ProcessGroupXCCL instance.)"); + R"(Create a new ProcessGroupXCCL instance.)") + .def( + py::init([](const c10::intrusive_ptr<::c10d::Store>& store, + int rank, + int size) { + // gil_scoped_release is not safe as a call_guard in init. + // https://github.com/pybind/pybind11/issues/5473 + py::gil_scoped_release nogil{}; + + auto options = ::c10d::ProcessGroupXCCL::Options::create(); + options->is_high_priority_stream = false; + return c10::make_intrusive<::c10d::ProcessGroupXCCL>( + store, rank, size, options); + }), + py::arg("store"), + py::arg("rank"), + py::arg("size"), + R"(Create a new ProcessGroupXCCL instance.)") + .def_property_readonly( + "options", + &::c10d::ProcessGroupXCCL::getOptions, + R"(Return the options used to create this ProcessGroupXCCL instance.)"); intrusive_ptr_class_<::c10d::ProcessGroupXCCL::Options>( processGroupXCCL, "Options", backendOptions) - .def(py::init<>()); + .def(py::init(), py::arg("is_high_priority_stream") = false) + .def_readwrite( + "is_high_priority_stream", + &::c10d::ProcessGroupXCCL::Options::is_high_priority_stream); module .def( "_dump_xccl_trace", diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index 53cee2632b11..a51cfaf8c5c1 100644 --- a/torch/csrc/utils/python_arg_parser.cpp +++ b/torch/csrc/utils/python_arg_parser.cpp @@ -936,6 +936,9 @@ static bool is_int_or_symint(PyObject* obj) { if (torch::is_symint(py::handle(obj))) { return true; } + if (torch::is_dynint(py::handle(obj))) { + return true; + } // FakeTensor(..., size=()) is qualified for SymInt param, // but we can't go via __index__ (below) as we would normally @@ -1070,7 +1073,8 @@ auto FunctionParameter::_check( return !var.requires_grad() && var.dim() == 0; } if (torch::is_symfloat(py::handle(obj)) || - torch::is_symint(py::handle(obj))) { + torch::is_symint(py::handle(obj)) || + torch::is_dynint(py::handle(obj))) { // This will induce a guard return true; } @@ -1085,7 +1089,8 @@ auto FunctionParameter::_check( return at::isIntegralType(var.scalar_type(), /*includeBool=*/false) && !var.requires_grad() && var.dim() == 0; } - if (torch::is_symint(py::handle(obj))) { + if (torch::is_symint(py::handle(obj)) || + torch::is_dynint(py::handle(obj))) { // This will induce a guard return true; } @@ -1127,7 +1132,8 @@ auto FunctionParameter::_check( // Allow symint to be passed in as device, but we'll specialize and // guard in this case. return THPUtils_checkLong(obj) || THPUtils_checkString(obj) || - THPDevice_Check(obj) || torch::is_symint(py::handle(obj)); + THPDevice_Check(obj) || torch::is_symint(py::handle(obj)) || + torch::is_dynint(py::handle(obj)); case ParameterType::STREAM: return THPStream_Check(obj); case ParameterType::STRING: @@ -1881,7 +1887,8 @@ at::Tensor PythonArgs::tensor_slow(int i) { // NB: we DO NOT put symbolic ints/floats into the Scalar itself, // because although Scalar supports SymInt/SymFloat, the subsequent // conversion to Tensor does not. Instead, do it out of band. - } else if (torch::is_symint(py::handle(obj))) { + } else if ( + torch::is_symint(py::handle(obj)) || torch::is_dynint(py::handle(obj))) { save_symint = true; // This scalar value doesn't matter, it shouldn't ever actually // get read out. 
Make it a big and weird looking number to help @@ -1969,6 +1976,10 @@ at::Scalar PythonArgs::scalar_slow(PyObject* arg) { return at::Scalar(py::cast(arg)); } + if (torch::is_dynint(arg)) { + return at::Scalar(py::cast(arg)); + } + if (torch::is_symfloat(arg)) { return at::Scalar(py::cast(arg)); } diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index a81f861ae903..5887235f72e5 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -89,7 +89,7 @@ inline bool THPUtils_checkScalar(PyObject* obj) { } #endif return PyFloat_Check(obj) || PyLong_Check(obj) || PyComplex_Check(obj) || - torch::is_symint(py::handle(obj)) || + torch::is_symint(py::handle(obj)) || torch::is_dynint(py::handle(obj)) || torch::is_symfloat(py::handle(obj)) || torch::is_symbool(py::handle(obj)); } @@ -612,6 +612,8 @@ inline std::vector PythonArgs::symintlist(int i) { try { if (is_symint(py::handle(obj))) { res.push_back(py::handle(obj).cast()); + } else if (is_dynint(py::handle(obj))) { + res.push_back(py::handle(obj).cast()); } else { res.emplace_back(THPUtils_unpackIndex(obj)); } @@ -640,6 +642,9 @@ inline std::vector PythonArgs::intlistWithDefault( size1, py::handle(arg).cast().guard_int(__FILE__, __LINE__)); } + if (size1 > 0 && torch::is_dynint(py::handle(arg))) { + return std::vector(size1, py::handle(arg).cast()); + } auto tuple = PyTuple_Check(arg); // NOLINTNEXTLINE(bugprone-branch-clone) const auto size2 = tuple ? PyTuple_GET_SIZE(arg) : PyList_GET_SIZE(arg); @@ -672,6 +677,8 @@ inline std::vector PythonArgs::intlistWithDefault( } else if (torch::is_symint(py::handle(obj))) { res[idx] = py::cast(py::handle(obj)) .guard_int(__FILE__, __LINE__); + } else if (torch::is_dynint(py::handle(obj))) { + res[idx] = py::handle(obj).cast(); } else if (THPVariable_Check(obj)) { auto& var = THPVariable_Unpack(obj); if (var.numel() != 1 || @@ -846,6 +853,10 @@ inline at::Device toDevice(PyObject* obj) { py::cast(py::handle(obj)).guard_int(__FILE__, __LINE__); return deviceFromLong(device_index); } + if (torch::is_dynint(py::handle(obj))) { + auto device_index = py::cast(py::handle(obj)); + return deviceFromLong(device_index); + } const std::string& device_str = THPUtils_unpackString(obj); return at::Device(device_str); } @@ -982,6 +993,9 @@ inline int64_t PythonArgs::toInt64(int i) { return py::cast(py::handle(args[i])) .guard_int(__FILE__, __LINE__); } + if (torch::is_dynint(py::handle(args[i]))) { + return py::cast(py::handle(args[i])); + } return THPUtils_unpackLong(args[i]); } @@ -1055,6 +1069,9 @@ inline double PythonArgs::toDouble(int i) { return static_cast(py::cast(py::handle(args[i])) .guard_int(__FILE__, __LINE__)); } + if (torch::is_dynint(py::handle(args[i]))) { + return static_cast(py::cast(py::handle(args[i]))); + } return THPUtils_unpackDouble(args[i]); } diff --git a/torch/csrc/utils/python_symnode.cpp b/torch/csrc/utils/python_symnode.cpp index 2c12e730abb1..9e17f8166a4b 100644 --- a/torch/csrc/utils/python_symnode.cpp +++ b/torch/csrc/utils/python_symnode.cpp @@ -53,4 +53,24 @@ py::handle get_symbool_class() { #endif } +py::handle get_dynint_class() { + // NB: leak +#if IS_PYBIND_2_13_PLUS + PYBIND11_CONSTINIT static py::gil_safe_call_once_and_store + storage; + return storage + .call_once_and_store_result([]() -> py::object { + return py::module::import("torch.fx.experimental.sym_node") + .attr("DynamicInt"); + }) + .get_stored(); +#else + static py::handle symbool_class = + 
py::object(py::module::import("torch.fx.experimental.sym_node") + .attr("DynamicInt")) + .release(); + return symbool_class; +#endif +} + } // namespace torch diff --git a/torch/csrc/utils/python_symnode.h b/torch/csrc/utils/python_symnode.h index 69d03b9b7a43..4b0237446770 100644 --- a/torch/csrc/utils/python_symnode.h +++ b/torch/csrc/utils/python_symnode.h @@ -12,6 +12,7 @@ namespace torch { TORCH_PYTHON_API py::handle get_symint_class(); TORCH_PYTHON_API py::handle get_symfloat_class(); TORCH_PYTHON_API py::handle get_symbool_class(); +TORCH_PYTHON_API py::handle get_dynint_class(); // NB: These functions must not be called too early, otherwise torch not setup. // Alternate design is to have torch "register" the object to us @@ -24,6 +25,9 @@ inline bool is_symfloat(py::handle obj) { inline bool is_symbool(py::handle obj) { return py::isinstance(obj, get_symbool_class()); } +inline bool is_dynint(py::handle obj) { + return py::isinstance(obj, get_dynint_class()); +} namespace impl { diff --git a/torch/cuda/tunable.py b/torch/cuda/tunable.py index 99f469d46dc1..a1fbd4fdddc2 100644 --- a/torch/cuda/tunable.py +++ b/torch/cuda/tunable.py @@ -591,7 +591,6 @@ def _process_single_offline_gemm(untuned_gemm_line: str, gpu_id: int) -> None: transA = layout[1] == "T" dtype = dtype_dict.get(data_type) if data_type == "tf32": - # User must still set HIPBLASLT_ALLOW_TF32=1 torch.backends.cuda.matmul.allow_tf32 = True else: torch.backends.cuda.matmul.allow_tf32 = False diff --git a/torch/distributed/_symmetric_memory/_nvshmem_triton.py b/torch/distributed/_symmetric_memory/_nvshmem_triton.py index b1fe5d067670..4bad8ff0ceb8 100644 --- a/torch/distributed/_symmetric_memory/_nvshmem_triton.py +++ b/torch/distributed/_symmetric_memory/_nvshmem_triton.py @@ -232,8 +232,8 @@ def putmem_signal_block( # type: ignore[no-untyped-def] dst, src, size_bytes, - sig_addr, signal, + sig_val, sig_op, pe, ): # type: ignore[no-untyped-def] @@ -245,10 +245,10 @@ def putmem_signal_block( # type: ignore[no-untyped-def] This enables efficient point-to-point synchronization between PEs. Args: - dst (int64): Symmetric address of the destination data object on the remote PE. - src (int64): Local address of the source data object containing data to be copied. + dst (tensor): A tensor on calling PE symmetric to the destination tensor on remote PE. + src (tensor): Local tensor containing the source data. size_bytes (int64): Number of bytes to transfer. Must be positive. - sig_addr (int64): Symmetric address of the signal variable (uint64_t) on the remote PE. + signal (tensor): Symmetric signal pad with remote PE. Must be 8-byte aligned symmetric memory. signal (int64): Value to be used in the signal operation. sig_op (int32): Signal operation type. 
Common values: @@ -276,13 +276,14 @@ def putmem_signal_block( # type: ignore[no-untyped-def] ) ``` """ - signal_64 = 0 << 32 | signal + # Ensure sig_val is 64 bits + sig_val = 0 << 32 | sig_val return putmem_signal_block_extern_wrapper( - dst, - src, + dst.to(tl.int64), + src.to(tl.int64), size_bytes.to(tl.int64), - sig_addr, - signal_64.to(tl.uint64), + signal.to(tl.int64), + sig_val.to(tl.uint64), sig_op, pe, ) @@ -292,8 +293,8 @@ def putmem_signal_block_extern_wrapper( # type: ignore[no-untyped-def] dst, src, size_bytes, - sig_addr, signal, + sig_val, sig_op, pe, _semantic=None, @@ -301,7 +302,7 @@ def putmem_signal_block_extern_wrapper( # type: ignore[no-untyped-def] return core.extern_elementwise( "", "", - [dst, src, size_bytes, sig_addr, signal, sig_op, pe], + [dst, src, size_bytes, signal, sig_val, sig_op, pe], { ( core.dtype("int64"), @@ -375,7 +376,7 @@ def wait_until_extern_wrapper(ivar, cmp, cmp_val, _semantic=None): # type: igno ) @triton.jit # type: ignore[misc] - def signal_wait_until(sig_addr, cmp, cmp_val): # type: ignore[no-untyped-def] + def signal_wait_until(signal, cmp, cmp_val): # type: ignore[no-untyped-def] """ Wait until a signal variable meets a specified condition. @@ -385,7 +386,7 @@ def signal_wait_until(sig_addr, cmp, cmp_val): # type: ignore[no-untyped-def] with signal operations. Args: - sig_addr (int64): Symmetric address of the signal variable (uint64_t). + signal (tensor): Symmetric signal tensor with remote PE. Must be 8-byte aligned symmetric memory. cmp (int32): Comparison operator. Common values: - NVSHMEM_CMP_EQ (0): Wait until signal == cmp_val @@ -414,14 +415,16 @@ def signal_wait_until(sig_addr, cmp, cmp_val): # type: ignore[no-untyped-def] ``` """ cmp_val = 0 << 32 | cmp_val - return signal_wait_until_extern_wrapper(sig_addr, cmp, cmp_val.to(tl.uint64)) + return signal_wait_until_extern_wrapper( + signal.to(tl.int64), cmp, cmp_val.to(tl.uint64) + ) @core.extern - def signal_wait_until_extern_wrapper(sig_addr, cmp, cmp_val, _semantic=None): # type: ignore[no-untyped-def] + def signal_wait_until_extern_wrapper(signal, cmp, cmp_val, _semantic=None): # type: ignore[no-untyped-def] return core.extern_elementwise( "", "", - [sig_addr, cmp, cmp_val], + [signal, cmp, cmp_val], { ( core.dtype("int64"), diff --git a/torch/distributed/fsdp/_fully_shard/_fsdp_param.py b/torch/distributed/fsdp/_fully_shard/_fsdp_param.py index db8f2bf722f0..ee6f3299e988 100644 --- a/torch/distributed/fsdp/_fully_shard/_fsdp_param.py +++ b/torch/distributed/fsdp/_fully_shard/_fsdp_param.py @@ -833,10 +833,24 @@ def reset_sharded_param(self): if local_tensor.is_meta: return updated_local_tensor = False + # local_tensor can be padded twice + # 1st time in fully_shard(model) + # 2nd time in model(input) lazy_init + # 2nd time should be no-op if parameters remain unchanged + # 2nd time shouldn't be no-op if people call model.load_state_dict(...) 
before lazy_init + # this makes it possible for trainer to call `sd = model.state_dict()` before the training loop + # and use `sd` without calling .state_dict() per iteration + same_local_tensor = False + # TODO: need to support tensor subclass + if type(self._sharded_param_data) is torch.Tensor: + same_local_tensor = ( + self._sharded_param_data.untyped_storage().data_ptr() + == local_tensor.untyped_storage().data_ptr() + ) padded_sharded_size = self.padded_sharded_param_size shard_dim = self.fsdp_placement.dim length = local_tensor.size(shard_dim) if local_tensor.numel() > 0 else 0 - if local_tensor.size() != padded_sharded_size: + if local_tensor.size() != padded_sharded_size and not same_local_tensor: assert shard_dim == 0, ( f"Shard({shard_dim}) requires even sharding: {local_tensor.size()=}" ) @@ -849,7 +863,8 @@ def reset_sharded_param(self): if self.pin_memory and not local_tensor.is_pinned(): local_tensor = local_tensor.cpu().pin_memory() updated_local_tensor = True - self._sharded_param_data = local_tensor.view(-1) + if not same_local_tensor: + self._sharded_param_data = local_tensor.view(-1) assert isinstance(self.sharded_param, DTensor) # mypy if updated_local_tensor: # Only change the local tensor object if needed diff --git a/torch/distributed/tensor/_dispatch.py b/torch/distributed/tensor/_dispatch.py index 9703c412657f..7e956254fb95 100644 --- a/torch/distributed/tensor/_dispatch.py +++ b/torch/distributed/tensor/_dispatch.py @@ -521,5 +521,7 @@ def _try_replicate_spec_for_scalar_tensor( raise RuntimeError( f"{op_call}: got mixed torch.Tensor and DTensor, need to convert all" " torch.Tensor to DTensor before calling distributed operators!" + " Please see https://docs.pytorch.org/docs/main/distributed.tensor.html#mixed-tensor-and-dtensor-operations" + " for more details." 
) return replication_spec diff --git a/torch/distributed/tensor/_ops/_math_ops.py b/torch/distributed/tensor/_ops/_math_ops.py index 1e6eb40939e4..b43a403eae2a 100644 --- a/torch/distributed/tensor/_ops/_math_ops.py +++ b/torch/distributed/tensor/_ops/_math_ops.py @@ -22,6 +22,7 @@ expand_to_full_mesh_op_strategy, generate_redistribute_costs, is_tensor_evenly_shardable, + is_tensor_evenly_shardable_on_dim, normalize_dim, normalize_dims, register_op_strategy, @@ -268,6 +269,15 @@ def common_reduction_strategy( reduction_strategy = OpStrategy([]) for op_spec in input_strategy.strategies: + if reduction_op == "avg": + output_spec = op_spec.output_spec + local_shape = list(output_spec.tensor_meta.shape) # type:ignore[union-attr] + for dim in reduce_dims: + if not is_tensor_evenly_shardable_on_dim(local_shape, output_spec, dim): + # reduce(avg) is not linear for unevenly sharded tensors + reduction_linear = False + break + if not reduction_linear: # input placements for this strategy should clear out pending sum and sharding # on the reduction dimension @@ -310,6 +320,7 @@ def common_reduction_strategy( aten.prod.default: "product", aten.prod.dim_int: "product", aten.prod.int_out: "product", + # avg is only linear when there is no padding aten.mean.default: "avg", aten.mean.dim: "avg", aten.mean.out: "avg", diff --git a/torch/distributed/tensor/_ops/utils.py b/torch/distributed/tensor/_ops/utils.py index 2d05b62aef44..1e6540a82c02 100644 --- a/torch/distributed/tensor/_ops/utils.py +++ b/torch/distributed/tensor/_ops/utils.py @@ -194,6 +194,22 @@ def is_tensor_evenly_shardable(shape: Sequence[int], spec: DTensorSpec) -> bool: return True +def is_tensor_evenly_shardable_on_dim( + shape: Sequence[int], spec: DTensorSpec, dim: int +) -> bool: + """Check if the shape is evenly shardable according to the spec on dim.""" + dim = normalize_dim(dim, len(shape)) + + num_shards = 1 + for i, placement in enumerate(spec.placements): + if placement.is_shard(): + shard_dim = cast(Shard, placement).dim + if shard_dim == dim: + num_shards *= spec.mesh.size(i) + + return shape[dim] % num_shards == 0 + + def is_tensor_dim_sharded(spec: DTensorSpec, dim: int) -> bool: """Return True if tensor dim is sharded.""" return any(p.is_shard(dim) for p in spec.placements) diff --git a/torch/distributed/tensor/experimental/_attention.py b/torch/distributed/tensor/experimental/_attention.py index 891bfe91e7f7..1822c0ed4f66 100644 --- a/torch/distributed/tensor/experimental/_attention.py +++ b/torch/distributed/tensor/experimental/_attention.py @@ -2,27 +2,18 @@ import itertools import logging import types -import weakref from abc import ABC, abstractmethod from collections.abc import Generator from dataclasses import dataclass from enum import auto, Enum -from typing import Any, Callable, Optional, Protocol, Union +from typing import Any, Callable, Optional, Protocol import torch import torch.distributed as dist import torch.distributed._functional_collectives as ft_c import torch.nn.functional as F -from torch import nn from torch.distributed.device_mesh import DeviceMesh -from torch.distributed.tensor import ( - distribute_module, - distribute_tensor, - DTensor, - Replicate, - Shard, -) -from torch.distributed.tensor.parallel.style import ParallelStyle +from torch.distributed.tensor import distribute_tensor, DTensor, Shard from torch.nn.attention.flex_attention import ( _mask_mod_signature, BlockMask, @@ -73,10 +64,6 @@ class _ContextParallelOptions: @dataclass class _ContextParallelGlobalVars: - # The current context 
parallel impl requires a record of some info - # as global vars. This dataclass stores those variables. - # TODO: this var should be able to stored in CP context - cp_shard_dim: int = 0 # This variable stores the TorchFunctionMode singleton because using multiple TF # instances for dispatching may trigger recompilations torch_function_mode: Optional[TorchFunctionMode] = None @@ -85,11 +72,6 @@ class _ContextParallelGlobalVars: _cp_global_vars = _ContextParallelGlobalVars() -def _set_cp_global_var(name: str, value: Any) -> None: - """Set a global variable for context parallelism.""" - setattr(_cp_global_vars, name, value) - - def _is_causal_behavior( rank: int, world_size: int, i: int, is_causal: bool ) -> _CausalBehavior: @@ -1014,108 +996,6 @@ def _enable_cp_dispatcher() -> Generator[None, None, None]: DTensor._op_dispatcher._custom_op_handlers = old_handlers -class _AttentionContextParallel(ParallelStyle): - """ - Applies context parallel optimizations to the attention layer. - - This will work for nn.MultiHeadedAttention and custom attention layers that - call F.scaled_dotproduct_attention with a similar signature. - - This expects the `forward` method consumes either: - - * a single tensor for self attention - * one argument for each of: query, key, value - - This currently only supports ring attention and the - SDPBackend.FLASH_ATTENTION backend. See sdpa_kernel. - - Non-flash attention backends will result in incorrect results. - """ - - # use a weakref dictionary to store context managers for each nn.Module - _CONTEXT_MANAGERS: "weakref.WeakKeyDictionary[nn.Module, Any]" = ( - weakref.WeakKeyDictionary() - ) - - def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module: - if not device_mesh.ndim == 1: - raise ValueError("CP only supports single dimension device mesh") - - return distribute_module( - module, - device_mesh, - input_fn=self._input_fn, # type: ignore[arg-type] - output_fn=self._output_fn, # type: ignore[arg-type] - ) - - @classmethod - def _input_fn( - cls, - module: nn.Module, - inputs: tuple[Union[torch.Tensor, int, float], ...], - device_mesh: DeviceMesh, - ) -> tuple[Union[torch.Tensor, int, float], ...]: - # TODO(d4l3k); this should be Shard(2), need to fix Linear layer rules - placement = [Replicate()] - - def backward_hook(grad: torch.Tensor) -> None: - if module in cls._CONTEXT_MANAGERS: - cls._CONTEXT_MANAGERS[module].__exit__(None, None, None) - del cls._CONTEXT_MANAGERS[module] - - # convert inputs to DTensor - inp = [] - for input in inputs: - if isinstance(input, torch.Tensor) and not isinstance(input, DTensor): - input = DTensor.from_local( - input.contiguous(), device_mesh, placement, run_check=False - ) - - if isinstance(input, torch.Tensor) and input.requires_grad: - input.register_hook(backward_hook) - - inp.append(input) - - manager = _enable_cp_dispatcher() - manager.__enter__() - cls._CONTEXT_MANAGERS[module] = manager - - return tuple(inp) - - @classmethod - def _output_fn( - cls, - module: nn.Module, - outputs: Union[torch.Tensor, tuple[Union[torch.Tensor, int, float], ...]], - device_mesh: DeviceMesh, - ) -> Union[ - Union[torch.Tensor, int, float], tuple[Union[torch.Tensor, int, float], ...] 
- ]: - cls._CONTEXT_MANAGERS[module].__exit__(None, None, None) - del cls._CONTEXT_MANAGERS[module] - - def backward_hook(grad: torch.Tensor) -> None: - if module not in cls._CONTEXT_MANAGERS: - manager = _enable_cp_dispatcher() - manager.__enter__() - cls._CONTEXT_MANAGERS[module] = manager - - # back to local tensor - out = [] - for output in [outputs] if isinstance(outputs, torch.Tensor) else outputs: - output = output.to_local() if isinstance(output, DTensor) else output - - if isinstance(output, torch.Tensor) and output.requires_grad: - output.register_hook(backward_hook) - - out.append(output) - - if isinstance(outputs, torch.Tensor): - return out[0] - - return tuple(out) - - def create_cp_block_mask( mask_mod: _mask_mod_signature, B: int, @@ -1233,12 +1113,6 @@ def attention_output_fn(mesh: DeviceMesh, outputs: Any) -> Any: return tuple(new_outputs) - def unshard(x: torch.Tensor, mesh: DeviceMesh, shard_dim: int) -> torch.Tensor: - x = x.contiguous() - all_xs = [torch.empty_like(x) for _ in range(mesh.size())] - ft_c.all_gather_inplace(all_xs, x, mesh) - return torch.cat(all_xs, dim=shard_dim) - class DistributeFunction(TorchFunctionMode): def __init__( self, @@ -1270,10 +1144,10 @@ def __torch_function__( assert isinstance(block_mask, tuple) global_key = ft_c.all_gather_tensor_autograd( - key, _cp_global_vars.cp_shard_dim, self._device_mesh + key, seq_dim, self._device_mesh ) global_value = ft_c.all_gather_tensor_autograd( - value, _cp_global_vars.cp_shard_dim, self._device_mesh + value, seq_dim, self._device_mesh ) # shape rewrite: because torch.nn.flex_attention() checks @@ -1323,7 +1197,7 @@ def __torch_function__( attention_input_fn, attention_output_fn, ) - _set_cp_global_var("torch_function_mode", tf_mode) + _cp_global_vars.torch_function_mode = tf_mode with tf_mode: with _enable_cp_dispatcher(): diff --git a/torch/fx/experimental/sym_node.py b/torch/fx/experimental/sym_node.py index 5468191163ab..b6c19b9ddeb9 100644 --- a/torch/fx/experimental/sym_node.py +++ b/torch/fx/experimental/sym_node.py @@ -49,7 +49,7 @@ sym_node_log = torch._logging.getArtifactLogger(__name__, "sym_node") -__all__ = ["SymNode", "method_to_operator", "magic_methods"] +__all__ = ["SymNode", "method_to_operator", "magic_methods", "DynamicInt"] from torch.types import py_sym_types as SymTypes @@ -625,6 +625,40 @@ def is_constant(self): return False +class _DynamicScalar: + def __new__(cls, *args): + if cls is _DynamicScalar: + raise TypeError("_DynamicScalar is an abstract base class, use DynamicInt.") + return super().__new__(cls, *args) + + +class DynamicInt(_DynamicScalar, int): + """ + User API for marking dynamic integers in `torch.compile`. + Intended to be compatible with both compile and eager mode. + + Example usage:: + + fn = torch.compile(f) + x = DynamicInt(4) + fn(x) # compiles x as a dynamic integer input; returns f(4) + """ + + def __new__(cls, val): + assert isinstance(val, int) + obj = super().__new__(cls, int(val)) + return obj + + def __repr__(self): + return f"DynamicInt({self.real})" + + def __floordiv__(self, other): # // was casting to int without these overrides? 
+ return DynamicInt(self.real // other) + + def __rfloordiv__(self, other): + return DynamicInt(other // self.real) + + # TODO: this probably needs the sizes-strides eval functions METHOD_TO_OPERATOR = { "pos": operator.pos, @@ -1650,7 +1684,6 @@ def sizes_strides_user(sizes, strides): def _make_user_magic(method, user_type): # User magic takes care of wrapping the other operand into a node, # so that our internal logic can assume everything is nodes - if method in magic_methods_on_operator_with_trailing_underscore: method_attr = f"sym_{method}" else: @@ -1781,7 +1814,7 @@ def rbinary_magic_impl(self, other): other = promote(other) self, other = promote2(self, other) if is_constant(self): - return (method_to_operator(method))(get_constant(self), other) + return (method_to_operator(method))(other, get_constant(self)) if is_constant(other): other = get_constant(other) other_node = to_node(self.node, other) @@ -1790,11 +1823,31 @@ def rbinary_magic_impl(self, other): ret = wrap_node(getattr(other_node, method_attr)(self.node)) return get_constant(ret) if is_constant(ret) else ret + def setattrs(user_type, attr, symnode_impl): + """ + Registers the SymNode magic method on SymInt/Float/Bool, + and optionally registers a corresponding wrapped method on DynamicInt. + """ + + # SymInt/Float/Bool + setattr(user_type, attr, symnode_impl) + + # DynamicInt impl + def dynamic_int_impl(*args): + args = [x.real if isinstance(x, DynamicInt) else x for x in args] + out = getattr(int, attr)(*args) + if isinstance(out, int) and not isinstance(out, bool): + return DynamicInt(out) + return out + + if user_type is SymInt: + setattr(DynamicInt, attr, dynamic_int_impl) + if method in unary_magic_methods: - setattr(user_type, f"__{method}__", unary_magic_impl) + setattrs(user_type, f"__{method}__", unary_magic_impl) elif method in unary_nonmagic_methods: orig = getattr(user_type, method) - setattr(user_type, method, update_wrapper(unary_magic_impl, orig)) + setattrs(user_type, method, update_wrapper(unary_magic_impl, orig)) elif method == "sym_ite": def sym_ite_magic_impl(pred, then_val, else_val): @@ -1811,7 +1864,7 @@ def sym_ite_magic_impl(pred, then_val, else_val): ret = wrap_node(getattr(pred.node, method_attr)(then_node, else_node)) return get_constant(ret) if ret.node.is_constant() else ret - setattr(user_type, f"__{method}__", sym_ite_magic_impl) + setattrs(user_type, f"__{method}__", sym_ite_magic_impl) elif method == "round": def round_magic_impl(self, ndigits=None): @@ -1820,14 +1873,14 @@ def round_magic_impl(self, ndigits=None): return wrap_node(getattr(self.node, method)(ndigits)) - setattr(user_type, f"__{method}__", round_magic_impl) + setattrs(user_type, f"__{method}__", round_magic_impl) else: method_name = method if method in bitwise_ops: method_name = bitwise_ops[method] - setattr(user_type, f"__{method_name}__", binary_magic_impl) + setattrs(user_type, f"__{method_name}__", binary_magic_impl) if method in reflectable_magic_methods: - setattr(user_type, f"__r{method_name}__", rbinary_magic_impl) + setattrs(user_type, f"__r{method_name}__", rbinary_magic_impl) for method, func in magic_methods.items(): # type: ignore[assignment] diff --git a/torch/jit/_script.py b/torch/jit/_script.py index ccd967d69f4e..4c06ed240782 100644 --- a/torch/jit/_script.py +++ b/torch/jit/_script.py @@ -15,6 +15,7 @@ import pickle import warnings from typing import Any, Callable, Union +from typing_extensions import deprecated import torch import torch._jit_internal as _jit_internal @@ -750,6 +751,10 @@ def save(self, 
f, **kwargs): """ return self._c.save(str(f), **kwargs) + @deprecated( + "Lite Interpreter is deprecated. Please consider switching to ExecuTorch. \ + https://docs.pytorch.org/executorch/stable/getting-started.html" + ) def _save_for_lite_interpreter(self, *args, **kwargs): r"""Add (or update) the bytecode session to the script model. @@ -763,9 +768,23 @@ def _save_for_lite_interpreter(self, *args, **kwargs): _extra_files: Map from filename to contents which will be stored as part of 'f'. """ + warnings.warn( + "Lite Interpreter is deprecated. Please consider switching to ExecuTorch. \ + https://docs.pytorch.org/executorch/stable/getting-started.html", + DeprecationWarning, + ) return self._c._save_for_mobile(*args, **kwargs) + @deprecated( + "Lite Interpreter is deprecated. Please consider switching to ExecuTorch. \ + https://docs.pytorch.org/executorch/stable/getting-started.html" + ) def _save_to_buffer_for_lite_interpreter(self, *args, **kwargs): + warnings.warn( + "Lite Interpreter is deprecated. Please consider switching to ExecuTorch. \ + https://docs.pytorch.org/executorch/stable/getting-started.html", + DeprecationWarning, + ) return self._c._save_to_buffer_for_mobile(*args, **kwargs) def save_to_buffer(self, *args, **kwargs): diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py index be284429114f..846d2b407684 100644 --- a/torch/testing/_internal/common_cuda.py +++ b/torch/testing/_internal/common_cuda.py @@ -181,9 +181,6 @@ def tf32_off(): @contextlib.contextmanager def tf32_on(self, tf32_precision=1e-5): - if torch.version.hip: - hip_allow_tf32 = os.environ.get("HIPBLASLT_ALLOW_TF32", None) - os.environ["HIPBLASLT_ALLOW_TF32"] = "1" old_allow_tf32_matmul = torch.backends.cuda.matmul.allow_tf32 old_precision = self.precision try: @@ -192,11 +189,6 @@ def tf32_on(self, tf32_precision=1e-5): with torch.backends.cudnn.flags(enabled=None, benchmark=None, deterministic=None, allow_tf32=True): yield finally: - if torch.version.hip: - if hip_allow_tf32 is not None: - os.environ["HIPBLASLT_ALLOW_TF32"] = hip_allow_tf32 - else: - del os.environ["HIPBLASLT_ALLOW_TF32"] torch.backends.cuda.matmul.allow_tf32 = old_allow_tf32_matmul self.precision = old_precision @@ -246,7 +238,7 @@ def tf32_enabled(): # if device is specified, it will check if device is cuda # if dtype is specified, it will check if dtype is float32 or complex64 # tf32 and fp32 are different only when all the three checks pass -def tf32_on_and_off(tf32_precision=1e-5, only_if=True): +def tf32_on_and_off(tf32_precision=1e-5, *, only_if=True): def with_tf32_disabled(self, function_call): with tf32_off(): function_call() diff --git a/torch/testing/_internal/distributed/multi_threaded_pg.py b/torch/testing/_internal/distributed/multi_threaded_pg.py index 8de13414dd47..1f5d1ef1bdbd 100644 --- a/torch/testing/_internal/distributed/multi_threaded_pg.py +++ b/torch/testing/_internal/distributed/multi_threaded_pg.py @@ -71,6 +71,24 @@ def bitwise_reduce(tensors, op): } +# Note [Hide collectives mutation from autograd] +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Threaded PG is intended to closely simulate the behavior of regular process +# groups. 
However, our regular PG implementations perform a dispatch through +# c10d, whereas Threaded PG does not for some reason (some superficial +# but not very convincing reasons include that Threaded PG is implemented +# in Python but you can't override Backend in Python, you can only override +# ProcessGroup in Python), thereby bypassing the dispatch step. Now we have +# a problem: c10d's signatures are LIES, they mutate their (output) tensor +# arguments but their annotations don't have mutations on them so we don't +# actually update any view metadata if you do differentiation. This +# ordinarily "doesn't matter" because distributed collectives aren't +# differentiable anyway, but it's possible to tickle this in testing if +# someone tries to touch the grad_fn of a Tensor. There are a few ways to +# fix this, but the easiest way was to use the .detach() trick to hide +# the mutations from autograd. + + class AllToAll: @torch.no_grad() def work(self, data): @@ -79,7 +97,10 @@ def work(self, data): output_tensor_list, _ = data[dest_rank] for src_rank in range(world_size): _, input_tensor_list = data[src_rank] - output_tensor_list[src_rank].copy_(input_tensor_list[dest_rank]) + # See Note [Hide collectives mutation from autograd] + output_tensor_list[src_rank].detach().copy_( + input_tensor_list[dest_rank] + ) class AllToAllBase: @@ -99,9 +120,10 @@ def work(self, data): input_buffer.size(0), input_split_sizes, world_size ) + # See Note [Hide collectives mutation from autograd] output_buffer[ output_indexes[src_rank] : output_indexes[src_rank + 1] ].detach().copy_( input_buffer[ input_indexes[dest_rank] : input_indexes[dest_rank + 1] ] @@ -152,7 +174,8 @@ def work(self, data): # copy all the reduced value to each rank for src_rank in range(len(data)): - data[src_rank][i].copy_(res.to(data[src_rank][i].device)) + # See Note [Hide collectives mutation from autograd] + data[src_rank][i].detach().copy_(res.to(data[src_rank][i].device)) class AllGather: @@ -166,7 +189,8 @@ def work(self, data): for dest in data: dest_tensor = dest[0][0][src_rank] - dest_tensor.copy_(src_tensor) + # See Note [Hide collectives mutation from autograd] + dest_tensor.detach().copy_(src_tensor) class Scatter: @@ -185,7 +209,8 @@ def work(self, data): # Can't handle scatter with multiple output tensor assert len(out_tensor_list) == 1 dest_tensor = out_tensor_list[0] - dest_tensor.copy_(src_in_tensors[rank]) + # See Note [Hide collectives mutation from autograd] + dest_tensor.detach().copy_(src_in_tensors[rank]) class Gather: @@ -202,7 +227,8 @@ def work(self, data): # Can't handle gather with multiple tensor lists assert len(src_in_tensor_list) == 1 dest_tensor = out_tensor_list[rank] - dest_tensor.copy_(src_in_tensor_list[0]) + # See Note [Hide collectives mutation from autograd] + dest_tensor.detach().copy_(src_in_tensor_list[0]) class ReduceScatter: @@ -224,14 +250,21 @@ def work(self, data): assert len(dest_tensor_on_rank_i) == 1 dst_tensor_device = dest_tensor_on_rank_i[0].device if not start_reduction[i]: - dest_tensor_on_rank_i[0].copy_(to_scatter[i].to(dst_tensor_device)) + # See Note [Hide collectives mutation from autograd] + dest_tensor_on_rank_i[0].detach().copy_( + to_scatter[i].to(dst_tensor_device) + ) start_reduction[i] = True else: - dest_tensor_on_rank_i[0].add_(to_scatter[i].to(dst_tensor_device)) + # See Note [Hide collectives mutation from autograd] + dest_tensor_on_rank_i[0].detach().add_( + to_scatter[i].to(dst_tensor_device) + ) if self.op == dist.ReduceOp.AVG: num_ranks = len(data) for
each_rank_data in data: - each_rank_data[0][0] /= num_ranks + # See Note [Hide collectives mutation from autograd] + each_rank_data[0][0].detach().div_(num_ranks) class Broadcast: @@ -242,9 +275,12 @@ def __init__(self, src): def work(self, data): in_tensor_list = flatten_list(data[self.src]) for i in range(len(data)): + if i == self.src: + continue out_tensor_list = flatten_list(data[i]) for j in range(len(in_tensor_list)): - out_tensor_list[j].copy_(in_tensor_list[j]) + # See Note [Hide collectives mutation from autograd] + out_tensor_list[j].detach().copy_(in_tensor_list[j]) class Collective: diff --git a/torch/utils/bottleneck/__init__.py b/torch/utils/bottleneck/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/torch/utils/bottleneck/__main__.py b/torch/utils/bottleneck/__main__.py deleted file mode 100644 index d8bc43be0e2b..000000000000 --- a/torch/utils/bottleneck/__main__.py +++ /dev/null @@ -1,229 +0,0 @@ -# mypy: allow-untyped-defs -import argparse -import cProfile -import pstats -import sys -import os - -import torch -from torch.autograd import profiler -from torch.utils.collect_env import get_env_info - - -def redirect_argv(new_argv): - sys.argv[:] = new_argv[:] - - -def compiled_with_cuda(sysinfo): - if sysinfo.cuda_compiled_version: - return f'compiled w/ CUDA {sysinfo.cuda_compiled_version}' - return 'not compiled w/ CUDA' - - -env_summary = """ --------------------------------------------------------------------------------- - Environment Summary --------------------------------------------------------------------------------- -PyTorch {pytorch_version}{debug_str} {cuda_compiled} -Running with Python {py_version} and {cuda_runtime} - -`{pip_version} list` truncated output: -{pip_list_output} -""".strip() - - -def run_env_analysis(): - print('Running environment analysis...') - info = get_env_info() - - result: dict[str, str] = {} - - debug_str = '' - if info.is_debug_build: - debug_str = ' DEBUG' - - cuda_avail = '' - if info.is_cuda_available: - cuda = info.cuda_runtime_version - if cuda is not None: - cuda_avail = 'CUDA ' + cuda - else: - cuda = 'CUDA unavailable' - - pip_version = info.pip_version - pip_list_output = info.pip_packages - if pip_list_output is None: - pip_list_output = 'Unable to fetch' - - result = { - 'debug_str': debug_str, - 'pytorch_version': info.torch_version, - 'cuda_compiled': compiled_with_cuda(info), - 'py_version': f'{sys.version_info[0]}.{sys.version_info[1]}', - 'cuda_runtime': cuda_avail, - 'pip_version': pip_version, - 'pip_list_output': pip_list_output, - } - - return env_summary.format(**result) - - -def run_cprofile(code, globs, launch_blocking=False): - print('Running your script with cProfile') - prof = cProfile.Profile() - prof.enable() - exec(code, globs, None) - prof.disable() - return prof - - -cprof_summary = """ --------------------------------------------------------------------------------- - cProfile output --------------------------------------------------------------------------------- -""".strip() - - -def print_cprofile_summary(prof, sortby='tottime', topk=15): - print(cprof_summary) - cprofile_stats = pstats.Stats(prof).sort_stats(sortby) - cprofile_stats.print_stats(topk) - - -def run_autograd_prof(code, globs): - def run_prof(use_cuda=False): - with profiler.profile(use_cuda=use_cuda) as prof: - exec(code, globs, None) - return prof - - print('Running your script with the autograd profiler...') - result = [run_prof(use_cuda=False)] - if torch.cuda.is_available(): - 
result.append(run_prof(use_cuda=True)) - else: - result.append(None) - - return result - - -autograd_prof_summary = """ --------------------------------------------------------------------------------- - autograd profiler output ({mode} mode) --------------------------------------------------------------------------------- - {description} -{cuda_warning} -{output} -""".strip() - - -def print_autograd_prof_summary(prof, mode, sortby='cpu_time', topk=15): - valid_sortby = ['cpu_time', 'cuda_time', 'cpu_time_total', 'cuda_time_total', 'count'] - if sortby not in valid_sortby: - warn = ('WARNING: invalid sorting option for autograd profiler results: {}\n' - 'Expected `cpu_time`, `cpu_time_total`, or `count`. ' - 'Defaulting to `cpu_time`.') - print(warn.format(sortby)) - sortby = 'cpu_time' - - if mode == 'CUDA': - cuda_warning = ('\n\tBecause the autograd profiler uses the CUDA event API,\n' - '\tthe CUDA time column reports approximately max(cuda_time, cpu_time).\n' - '\tPlease ignore this output if your code does not use CUDA.\n') - else: - cuda_warning = '' - - sorted_events = sorted(prof.function_events, - key=lambda x: getattr(x, sortby), reverse=True) - topk_events = sorted_events[:topk] - - result = { - 'mode': mode, - 'description': f'top {topk} events sorted by {sortby}', - 'output': torch.autograd.profiler_util._build_table(topk_events), - 'cuda_warning': cuda_warning - } - - print(autograd_prof_summary.format(**result)) - - -descript = """ -`bottleneck` is a tool that can be used as an initial step for debugging -bottlenecks in your program. - -It summarizes runs of your script with the Python profiler and PyTorch\'s -autograd profiler. Because your script will be profiled, please ensure that it -exits in a finite amount of time. - -For more complicated uses of the profilers, please see -https://docs.python.org/3/library/profile.html and -https://pytorch.org/docs/main/autograd.html#profiler for more information. -""".strip() - - -def parse_args(): - parser = argparse.ArgumentParser(description=descript) - parser.add_argument('scriptfile', type=str, - help='Path to the script to be run. ' - 'Usually run with `python path/to/script`.') - parser.add_argument('args', type=str, nargs=argparse.REMAINDER, - help='Command-line arguments to be passed to the script.') - return parser.parse_args() - - -def cpu_time_total(autograd_prof): - return sum(event.cpu_time_total for event in autograd_prof.function_events) - - -def main(): - args = parse_args() - - # Customizable constants. 
- scriptfile = args.scriptfile - scriptargs = [] if args.args is None else args.args - scriptargs.insert(0, scriptfile) - cprofile_sortby = 'tottime' - cprofile_topk = 15 - autograd_prof_sortby = 'cpu_time_total' - autograd_prof_topk = 15 - - redirect_argv(scriptargs) - - sys.path.insert(0, os.path.dirname(scriptfile)) - with open(scriptfile, 'rb') as stream: - code = compile(stream.read(), scriptfile, 'exec') - globs = { - '__file__': scriptfile, - '__name__': '__main__', - '__package__': None, - '__cached__': None, - } - - print(descript) - - env_summary = run_env_analysis() - - if torch.cuda.is_available(): - torch.cuda.init() - cprofile_prof = run_cprofile(code, globs) - autograd_prof_cpu, autograd_prof_cuda = run_autograd_prof(code, globs) - - print(env_summary) - print_cprofile_summary(cprofile_prof, cprofile_sortby, cprofile_topk) - - if not torch.cuda.is_available(): - print_autograd_prof_summary(autograd_prof_cpu, 'CPU', autograd_prof_sortby, autograd_prof_topk) - return - - # Print both the result of the CPU-mode and CUDA-mode autograd profilers - # if their execution times are very different. - cuda_prof_exec_time = cpu_time_total(autograd_prof_cuda) - if len(autograd_prof_cpu.function_events) > 0: - cpu_prof_exec_time = cpu_time_total(autograd_prof_cpu) - pct_diff = (cuda_prof_exec_time - cpu_prof_exec_time) / cuda_prof_exec_time - if abs(pct_diff) > 0.05: - print_autograd_prof_summary(autograd_prof_cpu, 'CPU', autograd_prof_sortby, autograd_prof_topk) - - print_autograd_prof_summary(autograd_prof_cuda, 'CUDA', autograd_prof_sortby, autograd_prof_topk) - -if __name__ == '__main__': - main() diff --git a/torch/utils/debug_mode.py b/torch/utils/debug_mode.py index 4862a394d1b1..805ec67777c7 100644 --- a/torch/utils/debug_mode.py +++ b/torch/utils/debug_mode.py @@ -81,7 +81,7 @@ class DebugMode(TorchDispatchMode): def __init__( self, *, - record_torchfunction=True, + record_torchfunction=False, record_faketensor=False, record_realtensor=True, ):
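Usage sketch for the final hunk above (not part of the diff): with record_torchfunction now defaulting to False, torch-function-level recording in DebugMode becomes opt-in, while dispatch-level recording of real tensors stays on by default. A minimal illustration, assuming the existing debug_string() helper on DebugMode:

    import torch
    from torch.utils.debug_mode import DebugMode

    # Torch-function recording must now be requested explicitly;
    # dispatch-level calls are still recorded under the mode.
    with DebugMode(record_torchfunction=True) as dm:
        torch.mm(torch.randn(4, 4), torch.randn(4, 4))

    print(dm.debug_string())  # dump the calls observed while the mode was active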